From 51fb24ab730f3b09d78e200f020b01974dc9e457 Mon Sep 17 00:00:00 2001 From: Kevin Morris <kevr@0cost.org> Date: Sun, 31 Oct 2021 16:52:30 -0700 Subject: [PATCH] fix(mkpkglists): improve package meta archive The SQL logic in this file for package metadata now exactly reflects RPC's search logic, without searching for specific packages. One command line argument is available: --extended | Include License, Keywords, Groups, relations and dependencies. When --extended is passed, the script will create a packages-meta-ext-v1.json.gz, configured via packagesmetaextfile. Archive JSON is in the following format: line-separated package objects enclosed in a list: [ {...}, {...}, {...} ] Signed-off-by: Kevin Morris <kevr@0cost.org> --- .gitlab-ci.yml | 2 +- INSTALL | 3 +- aurweb/scripts/mkpkglists.py | 279 +++++++++++++++++++++++++++++++---- conf/config.defaults | 1 + test/setup.sh | 2 + web/html/index.php | 3 +- 6 files changed, 261 insertions(+), 29 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index aff18a839..ce3740827 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,7 +12,7 @@ before_script: python-pygit2 python-srcinfo python-bleach python-markdown python-sqlalchemy python-alembic python-pytest python-werkzeug python-pytest-tap python-fastapi hypercorn nginx python-authlib - python-itsdangerous python-httpx + python-itsdangerous python-httpx python-orjson test: script: diff --git a/INSTALL b/INSTALL index 9bcd0759c..dc9cc51f3 100644 --- a/INSTALL +++ b/INSTALL @@ -49,7 +49,8 @@ read the instructions below. 
# pacman -S python-mysql-connector python-pygit2 python-srcinfo python-sqlalchemy \ python-bleach python-markdown python-alembic python-jinja \ - python-itsdangerous python-authlib python-httpx hypercorn + python-itsdangerous python-authlib python-httpx hypercorn \ + python-orjson # python3 setup.py install 5) Create a new MySQL database and a user and import the aurweb SQL schema: diff --git a/aurweb/scripts/mkpkglists.py b/aurweb/scripts/mkpkglists.py index c73cc3be2..f2095a20b 100755 --- a/aurweb/scripts/mkpkglists.py +++ b/aurweb/scripts/mkpkglists.py @@ -1,16 +1,192 @@ #!/usr/bin/env python3 +""" +Produces package, package base and user archives for the AUR +database. + +Archives: + + packages.gz | A line-separated list of package names + packages-meta-v1.json | A type=search RPC-formatted JSON dataset + packages-meta-ext-v1.json | An --extended archive + pkgbase.gz | A line-separated list of package base names + users.gz | A line-separated list of user names + +This script takes an optional argument: --extended. Based +on the following, right-hand side fields are added to each item. + + --extended | License, Keywords, Groups, relations and dependencies + +""" import datetime import gzip -import json +import os +import sys + +from collections import defaultdict +from decimal import Decimal +from typing import Tuple + +import orjson import aurweb.config import aurweb.db + +def state_path(archive: str) -> str: + # A hard-coded /tmp state directory. + # TODO: Use Redis cache to store this state after we merge + # FastAPI into master and removed PHP from the tree. 
+ return os.path.join("/tmp", os.path.basename(archive) + ".state") + + packagesfile = aurweb.config.get('mkpkglists', 'packagesfile') packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile') +packagesmetaextfile = aurweb.config.get('mkpkglists', 'packagesmetaextfile') +packages_state = state_path(packagesfile) + pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile') +pkgbases_state = state_path(pkgbasefile) + userfile = aurweb.config.get('mkpkglists', 'userfile') +users_state = state_path(userfile) + + +def should_update(state: str, tablename: str) -> Tuple[bool, int]: + if aurweb.config.get("database", "backend") != "mysql": + return (False, 0) + + db_name = aurweb.config.get("database", "name") + conn = aurweb.db.Connection() + cur = conn.execute("SELECT auto_increment FROM information_schema.tables " + "WHERE table_schema = ? AND table_name = ?", + (db_name, tablename,)) + update_time = cur.fetchone()[0] + + saved_update_time = 0 + if os.path.exists(state): + with open(state) as f: + saved_update_time = int(f.read().strip()) + + return (saved_update_time == update_time, update_time) + + +def update_state(state: str, update_time: int) -> None: + with open(state, "w") as f: + f.write(str(update_time)) + + +TYPE_MAP = { + "depends": "Depends", + "makedepends": "MakeDepends", + "checkdepends": "CheckDepends", + "optdepends": "OptDepends", + "conflicts": "Conflicts", + "provides": "Provides", + "replaces": "Replaces", +} + + +def get_extended_dict(query: str): + """ + Produce data in the form in a single bulk SQL query: + + { + <integer_package_id>: { + "Depends": [...], + "Conflicts": [...], + "License": [...] + } + } + + The caller can then use this data to populate a dataset of packages. 
+ + output = produce_base_output_data() + data = get_extended_dict(query) + for i in range(len(output)): + package_id = output[i].get("ID") + output[i].update(data.get(package_id)) + """ + + conn = aurweb.db.Connection() + + cursor = conn.execute(query) + + data = defaultdict(lambda: defaultdict(list)) + + for result in cursor.fetchall(): + + pkgid = result[0] + key = TYPE_MAP.get(result[1]) + output = result[2] + if result[3]: + output += result[3] + + # In all cases, we have at least an empty License list. + if "License" not in data[pkgid]: + data[pkgid]["License"] = [] + + # In all cases, we have at least an empty Keywords list. + if "Keywords" not in data[pkgid]: + data[pkgid]["Keywords"] = [] + + data[pkgid][key].append(output) + + conn.close() + return data + + +def get_extended_fields(): + # Returns: [ID, Type, Name, Cond] + query = """ + SELECT PackageDepends.PackageID AS ID, DependencyTypes.Name AS Type, + PackageDepends.DepName AS Name, PackageDepends.DepCondition AS Cond + FROM PackageDepends + LEFT JOIN DependencyTypes + ON DependencyTypes.ID = PackageDepends.DepTypeID + UNION SELECT PackageRelations.PackageID AS ID, RelationTypes.Name AS Type, + PackageRelations.RelName AS Name, + PackageRelations.RelCondition AS Cond + FROM PackageRelations + LEFT JOIN RelationTypes + ON RelationTypes.ID = PackageRelations.RelTypeID + UNION SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type, + Groups.Name, '' AS Cond + FROM Groups + INNER JOIN PackageGroups ON PackageGroups.GroupID = Groups.ID + UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type, + Licenses.Name, '' as Cond + FROM Licenses + INNER JOIN PackageLicenses ON PackageLicenses.LicenseID = Licenses.ID + UNION SELECT Packages.ID AS ID, 'Keywords' AS Type, + PackageKeywords.Keyword AS Name, '' as Cond + FROM PackageKeywords + INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID + """ + return get_extended_dict(query) + + +EXTENDED_FIELD_HANDLERS = { + "--extended": 
get_extended_fields +} + + +def is_decimal(column): + """ Check if an SQL column is of decimal.Decimal type. """ + if isinstance(column, Decimal): + return float(column) + return column + + +def write_archive(archive: str, output: list): + with gzip.open(archive, "wb") as f: + f.write(b"[\n") + for i, item in enumerate(output): + f.write(orjson.dumps(item)) + if i < len(output) - 1: + f.write(b",") + f.write(b"\n") + f.write(b"]") def main(): @@ -21,32 +197,83 @@ def main(): pkgbaselist_header = "# AUR package base list, generated on " + datestr userlist_header = "# AUR user name list, generated on " + datestr - with gzip.open(packagesfile, "w") as f: - f.write(bytes(pkglist_header + "\n", "UTF-8")) - cur = conn.execute("SELECT Packages.Name FROM Packages " + - "INNER JOIN PackageBases " + - "ON PackageBases.ID = Packages.PackageBaseID " + + updated, update_time = should_update(packages_state, "Packages") + if not updated: + print("Updating Packages...") + + # Query columns; copied from RPC. + columns = ("Packages.ID, Packages.Name, " + "PackageBases.ID AS PackageBaseID, " + "PackageBases.Name AS PackageBase, " + "Version, Description, URL, NumVotes, " + "Popularity, OutOfDateTS AS OutOfDate, " + "Users.UserName AS Maintainer, " + "SubmittedTS AS FirstSubmitted, " + "ModifiedTS AS LastModified") + + # Perform query. + cur = conn.execute(f"SELECT {columns} FROM Packages " + "LEFT JOIN PackageBases " + "ON PackageBases.ID = Packages.PackageBaseID " + "LEFT JOIN Users " + "ON PackageBases.MaintainerUID = Users.ID " "WHERE PackageBases.PackagerUID IS NOT NULL") - f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()]) - - with gzip.open(packagesmetafile, "wt") as f: - cur = conn.execute("SELECT * FROM Packages") - json.dump({ - "warning": "This is a experimental! 
It can be removed or modified without warning!", - "columns": [d[0] for d in cur.description], - "data": cur.fetchall() - }, f) - - with gzip.open(pkgbasefile, "w") as f: - f.write(bytes(pkgbaselist_header + "\n", "UTF-8")) - cur = conn.execute("SELECT Name FROM PackageBases " + - "WHERE PackagerUID IS NOT NULL") - f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()]) - - with gzip.open(userfile, "w") as f: - f.write(bytes(userlist_header + "\n", "UTF-8")) - cur = conn.execute("SELECT UserName FROM Users") - f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()]) + + # Produce packages-meta-v1.json.gz + output = list() + snapshot_uri = aurweb.config.get("options", "snapshot_uri") + for result in cur.fetchall(): + item = { + column[0]: is_decimal(result[i]) + for i, column in enumerate(cur.description) + } + item["URLPath"] = snapshot_uri % item.get("Name") + output.append(item) + + write_archive(packagesmetafile, output) + + # Produce packages-meta-ext-v1.json.gz + if len(sys.argv) > 1 and sys.argv[1] in EXTENDED_FIELD_HANDLERS: + f = EXTENDED_FIELD_HANDLERS.get(sys.argv[1]) + data = f() + + default_ = {"Groups": [], "License": [], "Keywords": []} + for i in range(len(output)): + data_ = data.get(output[i].get("ID"), default_) + output[i].update(data_) + + write_archive(packagesmetaextfile, output) + + # Produce packages.gz + with gzip.open(packagesfile, "wb") as f: + f.write(bytes(pkglist_header + "\n", "UTF-8")) + f.writelines([ + bytes(x.get("Name") + "\n", "UTF-8") + for x in output + ]) + + update_state(packages_state, update_time) + + updated, update_time = should_update(pkgbases_state, "PackageBases") + if not updated: + print("Updating PackageBases...") + # Produce pkgbase.gz + with gzip.open(pkgbasefile, "w") as f: + f.write(bytes(pkgbaselist_header + "\n", "UTF-8")) + cur = conn.execute("SELECT Name FROM PackageBases " + + "WHERE PackagerUID IS NOT NULL") + f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()]) + 
update_state(pkgbases_state, update_time) + + updated, update_time = should_update(users_state, "Users") + if not updated: + print("Updating Users...") + # Produce users.gz + with gzip.open(userfile, "w") as f: + f.write(bytes(userlist_header + "\n", "UTF-8")) + cur = conn.execute("SELECT UserName FROM Users") + f.writelines([bytes(x[0] + "\n", "UTF-8") for x in cur.fetchall()]) + update_state(users_state, update_time) conn.close() diff --git a/conf/config.defaults b/conf/config.defaults index 36ea02efb..a04f21bcc 100644 --- a/conf/config.defaults +++ b/conf/config.defaults @@ -93,5 +93,6 @@ server = ftp://mirrors.kernel.org/archlinux/%s/os/x86_64 [mkpkglists] packagesfile = /srv/http/aurweb/web/html/packages.gz packagesmetafile = /srv/http/aurweb/web/html/packages-meta-v1.json.gz +packagesmetaextfile = /srv/http/aurweb/web/html/packages-meta-ext-v1.json.gz pkgbasefile = /srv/http/aurweb/web/html/pkgbase.gz userfile = /srv/http/aurweb/web/html/users.gz diff --git a/test/setup.sh b/test/setup.sh index 24bb5f48a..f74cd1b7d 100644 --- a/test/setup.sh +++ b/test/setup.sh @@ -31,6 +31,7 @@ enable-maintenance = 0 maintenance-exceptions = 127.0.0.1 commit_uri = https://aur.archlinux.org/cgit/aur.git/log/?h=%s&id=%s localedir = $TOPLEVEL/web/locale/ +snapshot_uri = /cgit/aur.git/snapshot/%s.tar.gz [notifications] notify-cmd = $NOTIFY @@ -62,6 +63,7 @@ server = file://$(pwd)/remote/ [mkpkglists] packagesfile = packages.gz packagesmetafile = packages-meta-v1.json.gz +packagesmetaextfile = packages-meta-ext-v1.json.gz pkgbasefile = pkgbase.gz userfile = users.gz EOF diff --git a/web/html/index.php b/web/html/index.php index 3163c3e87..dc435162d 100644 --- a/web/html/index.php +++ b/web/html/index.php @@ -189,7 +189,8 @@ if (!empty($tokens[1]) && '/' . 
$tokens[1] == get_pkg_route()) { readfile("./$path"); break; case "/packages.gz": - case "/packages-teapot.json.gz": + case "/packages-meta-v1.json.gz": + case "/packages-meta-ext-v1.json.gz": case "/pkgbase.gz": case "/users.gz": header("Content-Type: text/plain"); -- GitLab