From 51fb24ab730f3b09d78e200f020b01974dc9e457 Mon Sep 17 00:00:00 2001
From: Kevin Morris <kevr@0cost.org>
Date: Sun, 31 Oct 2021 16:52:30 -0700
Subject: [PATCH] fix(mkpkglists): improve package meta archive

The SQL logic in this file for package metadata now exactly
reflects RPC's search logic, without searching for specific
packages.

The following command line argument is available:

    --extended | Include License, Keywords, Groups, relations
                 and dependencies.

When --extended is passed, the script will create a
packages-meta-ext-v1.json.gz, configured via packagesmetaextfile.

Archive JSON is in the following format: line-separated package objects
enclosed in a list:

    [
    {...},
    {...},
    {...}
    ]

Signed-off-by: Kevin Morris <kevr@0cost.org>
---
 .gitlab-ci.yml               |   2 +-
 INSTALL                      |   3 +-
 aurweb/scripts/mkpkglists.py | 279 +++++++++++++++++++++++++++++++----
 conf/config.defaults         |   1 +
 test/setup.sh                |   2 +
 web/html/index.php           |   3 +-
 6 files changed, 261 insertions(+), 29 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index aff18a839..ce3740827 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -12,7 +12,7 @@ before_script:
            python-pygit2 python-srcinfo python-bleach python-markdown
            python-sqlalchemy python-alembic python-pytest python-werkzeug
            python-pytest-tap python-fastapi hypercorn nginx python-authlib
-           python-itsdangerous python-httpx
+           python-itsdangerous python-httpx python-orjson
 
 test:
   script:
diff --git a/INSTALL b/INSTALL
index 9bcd0759c..dc9cc51f3 100644
--- a/INSTALL
+++ b/INSTALL
@@ -49,7 +49,8 @@ read the instructions below.
 
     # pacman -S python-mysql-connector python-pygit2 python-srcinfo python-sqlalchemy \
                 python-bleach python-markdown python-alembic python-jinja \
-                python-itsdangerous python-authlib python-httpx hypercorn
+                python-itsdangerous python-authlib python-httpx hypercorn \
+                python-orjson
     # python3 setup.py install
 
 5) Create a new MySQL database and a user and import the aurweb SQL schema:
diff --git a/aurweb/scripts/mkpkglists.py b/aurweb/scripts/mkpkglists.py
index c73cc3be2..f2095a20b 100755
--- a/aurweb/scripts/mkpkglists.py
+++ b/aurweb/scripts/mkpkglists.py
@@ -1,16 +1,192 @@
 #!/usr/bin/env python3
+"""
+Produces package, package base and user archives for the AUR
+database.
+
+Archives:
+
+    packages.gz               | A line-separated list of package names
+    packages-meta-v1.json     | A type=search RPC-formatted JSON dataset
+    packages-meta-ext-v1.json | An --extended archive
+    pkgbase.gz                | A line-separated list of package base names
+    users.gz                  | A line-separated list of user names
+
+This script takes an optional argument: --extended. When it is
+passed, the right-hand side fields below are added to each item.
+
+    --extended  | License, Keywords, Groups, relations and dependencies
+
+"""
 
 import datetime
 import gzip
-import json
+import os
+import sys
+
+from collections import defaultdict
+from decimal import Decimal
+from typing import Tuple
+
+import orjson
 
 import aurweb.config
 import aurweb.db
 
+
+def state_path(archive: str) -> str:
+    # A hard-coded /tmp state directory.
+    # TODO: Use Redis cache to store this state after we merge
+    # FastAPI into master and removed PHP from the tree.
+    return os.path.join("/tmp", os.path.basename(archive) + ".state")
+
+
 packagesfile = aurweb.config.get('mkpkglists', 'packagesfile')
 packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile')
+packagesmetaextfile = aurweb.config.get('mkpkglists', 'packagesmetaextfile')
+packages_state = state_path(packagesfile)
+
 pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile')
+pkgbases_state = state_path(pkgbasefile)
+
 userfile = aurweb.config.get('mkpkglists', 'userfile')
+users_state = state_path(userfile)
+
+
+def should_update(state: str, tablename: str) -> Tuple[bool, int]:
+    if aurweb.config.get("database", "backend") != "mysql":
+        return (False, 0)
+
+    db_name = aurweb.config.get("database", "name")
+    conn = aurweb.db.Connection()
+    cur = conn.execute("SELECT auto_increment FROM information_schema.tables "
+                       "WHERE table_schema = ? AND table_name = ?",
+                       (db_name, tablename,))
+    update_time = cur.fetchone()[0]
+
+    saved_update_time = 0
+    if os.path.exists(state):
+        with open(state) as f:
+            saved_update_time = int(f.read().strip())
+
+    return (saved_update_time == update_time, update_time)
+
+
+def update_state(state: str, update_time: int) -> None:
+    with open(state, "w") as f:
+        f.write(str(update_time))
+
+
+TYPE_MAP = {
+    "depends": "Depends",
+    "makedepends": "MakeDepends",
+    "checkdepends": "CheckDepends",
+    "optdepends": "OptDepends",
+    "conflicts": "Conflicts",
+    "provides": "Provides",
+    "replaces": "Replaces",
+}
+
+
+def get_extended_dict(query: str):
+    """
+    Produce data in the following form from a single bulk SQL query:
+
+    {
+        <integer_package_id>: {
+            "Depends": [...],
+            "Conflicts": [...],
+            "License": [...]
+        }
+    }
+
+    The caller can then use this data to populate a dataset of packages.
+
+    output = produce_base_output_data()
+    data = get_extended_dict(query)
+    for i in range(len(output)):
+        package_id = output[i].get("ID")
+        output[i].update(data.get(package_id))
+    """
+
+    conn = aurweb.db.Connection()
+
+    cursor = conn.execute(query)
+
+    data = defaultdict(lambda: defaultdict(list))
+
+    for result in cursor.fetchall():
+
+        pkgid = result[0]
+        key = TYPE_MAP.get(result[1])
+        output = result[2]
+        if result[3]:
+            output += result[3]
+
+        # In all cases, we have at least an empty License list.
+        if "License" not in data[pkgid]:
+            data[pkgid]["License"] = []
+
+        # In all cases, we have at least an empty Keywords list.
+        if "Keywords" not in data[pkgid]:
+            data[pkgid]["Keywords"] = []
+
+        data[pkgid][key].append(output)
+
+    conn.close()
+    return data
+
+
+def get_extended_fields():
+    # Returns: [ID, Type, Name, Cond]
+    query = """
+    SELECT PackageDepends.PackageID AS ID, DependencyTypes.Name AS Type,
+           PackageDepends.DepName AS Name, PackageDepends.DepCondition AS Cond
+    FROM PackageDepends
+    LEFT JOIN DependencyTypes
+    ON DependencyTypes.ID = PackageDepends.DepTypeID
+    UNION SELECT PackageRelations.PackageID AS ID, RelationTypes.Name AS Type,
+          PackageRelations.RelName AS Name,
+          PackageRelations.RelCondition AS Cond
+    FROM PackageRelations
+    LEFT JOIN RelationTypes
+    ON RelationTypes.ID = PackageRelations.RelTypeID
+    UNION SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type,
+          Groups.Name, '' AS Cond
+    FROM Groups
+    INNER JOIN PackageGroups ON PackageGroups.GroupID = Groups.ID
+    UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type,
+          Licenses.Name, '' as Cond
+    FROM Licenses
+    INNER JOIN PackageLicenses ON PackageLicenses.LicenseID = Licenses.ID
+    UNION SELECT Packages.ID AS ID, 'Keywords' AS Type,
+          PackageKeywords.Keyword AS Name, '' as Cond
+    FROM PackageKeywords
+    INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID
+    """
+    return get_extended_dict(query)
+
+
+EXTENDED_FIELD_HANDLERS = {
+    "--extended": get_extended_fields
+}
+
+
+def is_decimal(column):
+    """ Convert a decimal.Decimal column value to float; return others unchanged. """
+    if isinstance(column, Decimal):
+        return float(column)
+    return column
+
+
+def write_archive(archive: str, output: list):
+    with gzip.open(archive, "wb") as f:
+        f.write(b"[\n")
+        for i, item in enumerate(output):
+            f.write(orjson.dumps(item))
+            if i < len(output) - 1:
+                f.write(b",")
+            f.write(b"\n")
+        f.write(b"]")
 
 
 def main():
@@ -21,32 +197,83 @@ def main():
     pkgbaselist_header = "# AUR package base list, generated on " + datestr
     userlist_header = "# AUR user name list, generated on " + datestr
 
-    with gzip.open(packagesfile, "w") as f:
-        f.write(bytes(pkglist_header + "\n", "UTF-8"))
-        cur = conn.execute("SELECT Packages.Name FROM Packages " +
-                           "INNER JOIN PackageBases " +
-                           "ON PackageBases.ID = Packages.PackageBaseID " +
+    updated, update_time = should_update(packages_state, "Packages")
+    if not updated:
+        print("Updating Packages...")
+
+        # Query columns; copied from RPC.
+        columns = ("Packages.ID, Packages.Name, "
+                   "PackageBases.ID AS PackageBaseID, "
+                   "PackageBases.Name AS PackageBase, "
+                   "Version, Description, URL, NumVotes, "
+                   "Popularity, OutOfDateTS AS OutOfDate, "
+                   "Users.UserName AS Maintainer, "
+                   "SubmittedTS AS FirstSubmitted, "
+                   "ModifiedTS AS LastModified")
+
+        # Perform query.
+        cur = conn.execute(f"SELECT {columns} FROM Packages "
+                           "LEFT JOIN PackageBases "
+                           "ON PackageBases.ID = Packages.PackageBaseID "
+                           "LEFT JOIN Users "
+                           "ON PackageBases.MaintainerUID = Users.ID "
                            "WHERE PackageBases.PackagerUID IS NOT NULL")
-        f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
-
-    with gzip.open(packagesmetafile, "wt") as f:
-        cur = conn.execute("SELECT * FROM Packages")
-        json.dump({
-            "warning": "This is a experimental! It can be removed or modified without warning!",
-            "columns": [d[0] for d in cur.description],
-            "data": cur.fetchall()
-        }, f)
-
-    with gzip.open(pkgbasefile, "w") as f:
-        f.write(bytes(pkgbaselist_header + "\n", "UTF-8"))
-        cur = conn.execute("SELECT Name FROM PackageBases " +
-                           "WHERE PackagerUID IS NOT NULL")
-        f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
-
-    with gzip.open(userfile, "w") as f:
-        f.write(bytes(userlist_header + "\n", "UTF-8"))
-        cur = conn.execute("SELECT UserName FROM Users")
-        f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
+
+        # Produce packages-meta-v1.json.gz
+        output = list()
+        snapshot_uri = aurweb.config.get("options", "snapshot_uri")
+        for result in cur.fetchall():
+            item = {
+                column[0]: is_decimal(result[i])
+                for i, column in enumerate(cur.description)
+            }
+            item["URLPath"] = snapshot_uri % item.get("Name")
+            output.append(item)
+
+        write_archive(packagesmetafile, output)
+
+        # Produce packages-meta-ext-v1.json.gz
+        if len(sys.argv) > 1 and sys.argv[1] in EXTENDED_FIELD_HANDLERS:
+            f = EXTENDED_FIELD_HANDLERS.get(sys.argv[1])
+            data = f()
+
+            default_ = {"Groups": [], "License": [], "Keywords": []}
+            for i in range(len(output)):
+                data_ = data.get(output[i].get("ID"), default_)
+                output[i].update(data_)
+
+            write_archive(packagesmetaextfile, output)
+
+        # Produce packages.gz
+        with gzip.open(packagesfile, "wb") as f:
+            f.write(bytes(pkglist_header + "\n", "UTF-8"))
+            f.writelines([
+                bytes(x.get("Name") + "\n", "UTF-8")
+                for x in output
+            ])
+
+        update_state(packages_state, update_time)
+
+    updated, update_time = should_update(pkgbases_state, "PackageBases")
+    if not updated:
+        print("Updating PackageBases...")
+        # Produce pkgbase.gz
+        with gzip.open(pkgbasefile, "w") as f:
+            f.write(bytes(pkgbaselist_header + "\n", "UTF-8"))
+            cur = conn.execute("SELECT Name FROM PackageBases " +
+                               "WHERE PackagerUID IS NOT NULL")
+            f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
+        update_state(pkgbases_state, update_time)
+
+    updated, update_time = should_update(users_state, "Users")
+    if not updated:
+        print("Updating Users...")
+        # Produce users.gz
+        with gzip.open(userfile, "w") as f:
+            f.write(bytes(userlist_header + "\n", "UTF-8"))
+            cur = conn.execute("SELECT UserName FROM Users")
+            f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()])
+        update_state(users_state, update_time)
 
     conn.close()
 
diff --git a/conf/config.defaults b/conf/config.defaults
index 36ea02efb..a04f21bcc 100644
--- a/conf/config.defaults
+++ b/conf/config.defaults
@@ -93,5 +93,6 @@ server = ftp://mirrors.kernel.org/archlinux/%s/os/x86_64
 [mkpkglists]
 packagesfile = /srv/http/aurweb/web/html/packages.gz
 packagesmetafile = /srv/http/aurweb/web/html/packages-meta-v1.json.gz
+packagesmetaextfile = /srv/http/aurweb/web/html/packages-meta-ext-v1.json.gz
 pkgbasefile = /srv/http/aurweb/web/html/pkgbase.gz
 userfile = /srv/http/aurweb/web/html/users.gz
diff --git a/test/setup.sh b/test/setup.sh
index 24bb5f48a..f74cd1b7d 100644
--- a/test/setup.sh
+++ b/test/setup.sh
@@ -31,6 +31,7 @@ enable-maintenance = 0
 maintenance-exceptions = 127.0.0.1
 commit_uri = https://aur.archlinux.org/cgit/aur.git/log/?h=%s&id=%s
 localedir = $TOPLEVEL/web/locale/
+snapshot_uri = /cgit/aur.git/snapshot/%s.tar.gz
 
 [notifications]
 notify-cmd = $NOTIFY
@@ -62,6 +63,7 @@ server = file://$(pwd)/remote/
 [mkpkglists]
 packagesfile = packages.gz
 packagesmetafile = packages-meta-v1.json.gz
+packagesmetaextfile = packages-meta-ext-v1.json.gz
 pkgbasefile = pkgbase.gz
 userfile = users.gz
 EOF
diff --git a/web/html/index.php b/web/html/index.php
index 3163c3e87..dc435162d 100644
--- a/web/html/index.php
+++ b/web/html/index.php
@@ -189,7 +189,8 @@ if (!empty($tokens[1]) && '/' . $tokens[1] == get_pkg_route()) {
 		readfile("./$path");
 		break;
 	case "/packages.gz":
-	case "/packages-teapot.json.gz":
+	case "/packages-meta-v1.json.gz":
+	case "/packages-meta-ext-v1.json.gz":
 	case "/pkgbase.gz":
 	case "/users.gz":
 		header("Content-Type: text/plain");
-- 
GitLab