download popularities from s3 (mdn#3349)
* download popularities from s3

* update docs

* lint

* cap score

* make it a float again
fiji-flo authored and peterbe committed Jun 1, 2021
1 parent 50ce6db commit 41dd29c
Showing 3 changed files with 100 additions and 51 deletions.
98 changes: 76 additions & 22 deletions docs/popularities.md
@@ -1,7 +1,7 @@
# Popularities

-A popular page is one that has lot of pageviews. We get this from Google Analytics.
-Being popular helps search because when a certain search term matches many
+A popular page is one that has a lot of pageviews. We get this from our CDN access
+logs. Being popular helps search because when a certain search term matches many
documents, too many to display all, we need to sort them to try to predict
which one the user most probably wanted to find.

@@ -28,39 +28,93 @@ Where the most popular page is `1`.
Note that not all documents will have a popularity. So don't expect every known
URL in the content to appear in the `popularities.json` file.

-## How to get the data
+## Where's the data from

-To update the `popularities.json` file, you need to generate an "Unsampled Report"
-in Google Analytics.
-In Google Analytics, go to "Behavior" -> "Site content" -> "All pages". Then,
-click on "Export" (upper right-hand corner) and select "Unsampled report" and leave
-all options to default.
-Once you've done that, it takes a while, but you can now go to "Customization"
--> "Unsampled reports" and there, there should be a report called "Pages". On that
-row there's a "Download" column. Click "CSV" to download the `Pages.csv` file.
-Download that file and save anywhere on your computer.
+Popularities are based on our CDN access logs. We use CloudFront for our CDN.
+Access logs are post-processed using an
+[AWS Lambda function](https://github.com/aws-samples/amazon-cloudfront-access-logs-queries).

-## Run the CLI tool
-
-Once you have the `Pages.csv` file run:
+Every month these logs are aggregated by another Lambda called
+`popularitiesCron` using AWS Athena:
+
+```python
+import time
+import boto3
+
+from datetime import datetime, timezone, timedelta
+
+# This runs on the 1st of the month, so going back one week always lands
+# in the previous month.
+last_month = datetime.now(timezone.utc) - timedelta(weeks=1)
+
+month = "{:0>2}".format(last_month.month)
+year = "{}".format(last_month.year)
+
+query = """
+SELECT u AS Page,
+       count(*) AS Pageviews
+FROM
+    (SELECT replace(uri, '/index.json', '') AS u
+     FROM partitioned_parquet
+     WHERE year = '{}'
+       AND month = '{}'
+       AND status = 200
+       AND user_agent LIKE 'Mozilla%'
+       AND uri NOT LIKE '%/_samples_/%'
+       AND (uri LIKE '/%/docs/%'
+            AND sc_content_type = 'text/html; charset=utf-8'
+            OR uri LIKE '/%/docs/%/index.json'))
+GROUP BY u
+ORDER BY Pageviews DESC
+""".format(year, month)
+
+DATABASE = 'yariprod_cf_access_logs_db'
+output = 's3://mdn-popularities-prod/{}/{}/'.format(year, month)
+
+def lambda_handler(event, context):
+    client = boto3.client('athena')
+    response = client.start_query_execution(
+        QueryString=query,
+        QueryExecutionContext={
+            'Database': DATABASE
+        },
+        ResultConfiguration={
+            'OutputLocation': output,
+        }
+    )
+    # Publish the URL of the freshly written result CSV so consumers can
+    # always find the latest file.
+    s3 = boto3.resource('s3')
+    uuid = response["QueryExecutionId"]
+    if uuid:
+        content = (
+            "https://mdn-popularities-prod.s3.amazonaws.com/"
+            "{year}/{month}/{uuid}.csv"
+        ).format(year=year, month=month, uuid=uuid)
+        s3.Object(
+            'mdn-popularities-prod', 'current.txt'
+        ).put(Body=content, ContentType="text/plain; charset=utf-8")
+    return response
+```

+This is triggered via a CloudWatch cron job (`popularities-cron-trigger`) on
+the 1st of every month.
+
+Output is stored in an S3 bucket named `mdn-popularities-prod`.
+<s3://mdn-popularities-prod/current.txt> points to the current file.
+
+## Run the CLI tool

```bash
-yarn tool popularities ~/Downloads/Pages.csv
+yarn tool popularities
```

-This should now update the file `files/popularities.json` in your `mdn/content`
-repo. It takes the value of the `CONTENT_ROOT` constant.
+This should now download the latest popularities CSV and update the file
+`files/popularities.json` in your `mdn/content` repo. It takes the value of the
+`CONTENT_ROOT` constant.

Once you've done this, you need to make a pull request on the new `mdn/content`
repo.

-## The future
-
-We have talked about automating this. Not only is it very clunky to have to
-use the Google Analytics web app to get the report, it's also only a matter
-of time till it's out of date. And if a new page is introduced, since the last
-time you generated a report, it will be "unfavored" in search.
-
-One idea would be that we instead use Kuma to collect this. Then Yari could
-download it from Kuma right before the build starts. If we do this we would
-fully automate everything and the data would be more up-to-date.
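
To make the new pipeline concrete, here is an illustrative sample of the two artifacts it produces (all URLs and numbers are invented for illustration). The monthly Athena query writes a two-column CSV:

```csv
Page,Pageviews
/en-US/docs/Web/JavaScript,2000000
/en-US/docs/Web/CSS,1400000
/en-US/docs/Learn,500000
```

The CLI tool then normalizes those counts into `files/popularities.json`, where the most popular page scores `1`:

```json
{
  "/en-US/docs/Web/JavaScript": 1,
  "/en-US/docs/Web/CSS": 0.7,
  "/en-US/docs/Learn": 0.25
}
```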
32 changes: 8 additions & 24 deletions tool/cli.js
@@ -634,37 +634,21 @@ program

  .command(
    "popularities",
-    "Convert a Google Analytics pageviews CSV into a popularities.json file"
+    "Convert an AWS Athena log aggregation CSV into a popularities.json file"
  )
-  .option(
-    "--outfile <path>",
-    "export from Google Analytics containing pageview counts",
-    {
-      default: path.join(CONTENT_ROOT, "popularities.json"),
-    }
-  )
-  .option(
-    "--max-uris <number>",
-    "export from Google Analytics containing pageview counts",
-    {
-      default: MAX_GOOGLE_ANALYTICS_URIS,
-    }
-  )
-  .argument("csvfile", "Google Analytics pageviews CSV file", {
-    validator: (value) => {
-      if (!fs.existsSync(value)) {
-        throw new Error(`${value} does not exist`);
-      }
-      return value;
-    },
-  })
+  .option("--outfile <path>", "output file", {
+    default: path.join(CONTENT_ROOT, "popularities.json"),
+  })
+  .option("--max-uris <number>", "limit to top <number> entries", {
+    default: MAX_GOOGLE_ANALYTICS_URIS,
+  })
  .action(
-    tryOrExit(async ({ args, options, logger }) => {
+    tryOrExit(async ({ options, logger }) => {
      const {
        rowCount,
        popularities,
        pageviews,
-      } = await runMakePopularitiesFile(args.csvfile, options);
+      } = await runMakePopularitiesFile(options);
      logger.info(chalk.green(`Parsed ${rowCount.toLocaleString()} rows.`));

      const numberKeys = Object.keys(popularities).length;
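With the `csvfile` argument gone, the command needs no input path. A hypothetical invocation overriding both remaining options might look like this (the output path and limit are made-up examples, not project defaults):

```bash
# Write the result somewhere other than CONTENT_ROOT and keep only the
# top 10,000 URIs.
yarn tool popularities --outfile /tmp/popularities.json --max-uris 10000
```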
21 changes: 16 additions & 5 deletions tool/popularities.js
@@ -1,7 +1,7 @@
/**
 * This script exists only to periodically generate a
- * "content/popularities.json" file from a Google Analytics pageviews CSV
- * export.
+ * "content/popularities.json" file from a Cloudfront access CSV export.
 *
 * Generally, only the core MDN Web Docs team needs to run this. The output
 * file gets checked into git so it's easily available to everyone.
 *
@@ -12,14 +12,25 @@
const fs = require("fs");

const csv = require("@fast-csv/parse");
+const got = require("got");
+
+const CURRENT_URL =
+  "https://mdn-popularities-prod.s3.amazonaws.com/current.txt";
+
+async function fetchPopularities() {
+  // current.txt holds the URL of the most recent monthly CSV on S3.
+  let { body: csvURL } = await got(CURRENT_URL);
+  let { body: csv } = await got(csvURL);
+  return csv;
+}

-function runMakePopularitiesFile(filepath, options) {
+async function runMakePopularitiesFile(options) {
  const { outfile, maxUris } = options;
  const pageviews = [];
  let biggestCount = null;
+  const raw = await fetchPopularities();
  return new Promise((resolve, reject) => {
    csv
-      .parseFile(filepath, {
+      .parseString(raw, {
        headers: true,
      })
      .on("error", (error) => console.error(error))
@@ -51,7 +62,7 @@ function runMakePopularitiesFile(filepath, options) {
      }
      const popularities = {};
      pageviews.slice(0, maxUris).forEach(([uri, popularity]) => {
-        popularities[uri] = popularity;
+        popularities[uri] = parseFloat(popularity.toFixed(5));
      });
      fs.writeFileSync(outfile, JSON.stringify(popularities, null, 2));
      resolve({ rowCount, popularities, pageviews });
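This final hunk is where the "cap score" and "make it a float again" notes from the commit message land: `toFixed(5)` caps the score at five decimal places, but it returns a string, so `parseFloat()` is needed to make the value a float again and keep `popularities.json` values as JSON numbers rather than quoted strings. A worked example of the rounding, assuming the elided code computes each score as the pageview count divided by `biggestCount`:

```javascript
// The raw score: this page's pageviews relative to the biggest count
// (assumed normalization, based on how biggestCount is used in this file).
const biggestCount = 2000000;
const popularity = 1234 / biggestCount; // 0.000617

// toFixed(5) caps the precision but yields the string "0.00062";
// parseFloat() makes it a number again, so JSON.stringify() writes 0.00062
// instead of "0.00062".
console.log(parseFloat(popularity.toFixed(5))); // 0.00062
```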
