From e5cd3e1e02607b2e8499bb4e343fe810afe46b1c Mon Sep 17 00:00:00 2001 From: Kevin Morris <kevr@0cost.org> Date: Tue, 2 Nov 2021 00:33:34 -0700 Subject: [PATCH] fix(mkpkglists): add all RPC type=search fields to packages-meta-v1 The SQL logic in this file for package metadata now exactly reflects RPC's search logic, without searching for specific packages. Two command line arguments are available: --extended | Include License, Keywords and Groups fields --all | Include License, Keywords, Groups, dependencies and relations When one of these arguments is passed, the script will create a packages-meta-ext-v1.json.gz, configured via packagesmetaextfile. Done this way so that there are multiple options for archive; the roughly 2.4MB basic metadata (-meta-v1) and the ~4-7MB (--extended) / 9.8MB (--all) metadata (-meta-ext-v1). Archive JSON is in the following format: line-separated package objects enclosed in a list: [ {...}, {...}, {...} ] This allows users to stream the archive and process packages line-by-line, during the local storage process. Signed-off-by: Kevin Morris <kevr@0cost.org> --- .gitlab-ci.yml | 2 +- INSTALL | 2 +- aurweb/scripts/mkpkglists.py | 241 +++++++++++++++++++++++++++++------ conf/config.defaults | 1 + test/setup.sh | 2 + 5 files changed, 209 insertions(+), 39 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index aff18a839..ce3740827 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,7 +12,7 @@ before_script: python-pygit2 python-srcinfo python-bleach python-markdown python-sqlalchemy python-alembic python-pytest python-werkzeug python-pytest-tap python-fastapi hypercorn nginx python-authlib - python-itsdangerous python-httpx + python-itsdangerous python-httpx python-orjson test: script: diff --git a/INSTALL b/INSTALL index b161edd2d..b0989f48e 100644 --- a/INSTALL +++ b/INSTALL @@ -50,7 +50,7 @@ read the instructions below. 
# pacman -S python-mysql-connector python-pygit2 python-srcinfo python-sqlalchemy \ python-bleach python-markdown python-alembic python-jinja \ python-itsdangerous python-authlib python-httpx hypercorn \ - composer + composer python-orjson # python3 setup.py install 4a) Install `composer` dependencies while inside of aurweb's root: diff --git a/aurweb/scripts/mkpkglists.py b/aurweb/scripts/mkpkglists.py index 646d08650..16fe752e1 100755 --- a/aurweb/scripts/mkpkglists.py +++ b/aurweb/scripts/mkpkglists.py @@ -1,12 +1,35 @@ #!/usr/bin/env python3 +""" +Produces package, package base and user archives for the AUR +database. + +Archives: + + packages.gz | A line-separated list of package names + packages-meta-v1.json | A type=search RPC-formatted JSON dataset + packages-meta-ext-v1.json | An --extended or --all archive + pkgbase.gz | A line-separated list of package base names + users.gz | A line-separated list of user names + +This script takes an optional argument: --extended or --all. Based +on the following, right-hand side fields are added to each item. 
+ + --extended | License, Keywords, Groups + --all | License, Keywords, Groups, relations and dependencies + +""" import datetime import gzip -import json import os +import sys +from collections import defaultdict +from decimal import Decimal from typing import Tuple +import orjson + import aurweb.config import aurweb.db @@ -20,6 +43,7 @@ def state_path(archive: str) -> str: packagesfile = aurweb.config.get('mkpkglists', 'packagesfile') packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile') +packagesmetaextfile = aurweb.config.get('mkpkglists', 'packagesmetaextfile') packages_state = state_path(packagesfile) pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile') @@ -53,6 +77,142 @@ def update_state(state: str, update_time: int) -> None: f.write(str(update_time)) +TYPE_MAP = { + "depends": "Depends", + "makedepends": "MakeDepends", + "checkdepends": "CheckDepends", + "optdepends": "OptDepends", + "conflicts": "Conflicts", + "provides": "Provides", + "replaces": "Replaces", +} + + +def get_extended_dict(query: str): + """ + Produce data in the form in a single bulk SQL query: + + { + <integer_package_id>: { + "Depends": [...], + "Conflicts": [...], + "License": [...] + } + } + + The caller can then use this data to populate a dataset of packages. + + output = produce_base_output_data() + data = get_extended_dict(query) + for i in range(len(output)): + package_id = output[i].get("ID") + output[i].update(data.get(package_id)) + """ + + conn = aurweb.db.Connection() + + cursor = conn.execute(query) + + data = defaultdict(lambda: defaultdict(list)) + + for result in cursor.fetchall(): + + pkgid = result[0] + key = TYPE_MAP.get(result[1]) + output = result[2] + if result[3]: + output += result[3] + + # In all cases, we have at least an empty License list. + if "License" not in data[pkgid]: + data[pkgid]["License"] = [] + + # In all cases, we have at least an empty Keywords list. 
+ if "Keywords" not in data[pkgid]: + data[pkgid]["Keywords"] = [] + + if "Groups" not in data[pkgid]: + data[pkgid]["Groups"] = [] + + data[pkgid][key].append(output) + + conn.close() + return data + + +def get_limited_extended_fields(): + # Returns: [ID, Type, Name, Cond] + query = """ + SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type, + Groups.Name, '' AS Cond + FROM Groups INNER JOIN PackageGroups + ON PackageGroups.GroupID = Groups.ID + UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type, + Licenses.Name, '' as Cond + FROM Licenses INNER JOIN PackageLicenses + ON PackageLicenses.LicenseID = Licenses.ID + UNION SELECT Packages.ID AS ID, 'Keywords' AS Type, + PackageKeywords.Keyword AS Name, '' as Cond + FROM PackageKeywords + INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID + """ + return get_extended_dict(query) + + +def get_extended_fields(): + # Returns: [ID, Type, Name, Cond] + query = """ + SELECT PackageDepends.PackageID AS ID, DependencyTypes.Name AS Type, + PackageDepends.DepName AS Name, PackageDepends.DepCondition AS Cond + FROM PackageDepends + LEFT JOIN DependencyTypes + ON DependencyTypes.ID = PackageDepends.DepTypeID + UNION SELECT PackageRelations.PackageID AS ID, RelationTypes.Name AS Type, + PackageRelations.RelName AS Name, + PackageRelations.RelCondition AS Cond + FROM PackageRelations + LEFT JOIN RelationTypes + ON RelationTypes.ID = PackageRelations.RelTypeID + UNION SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type, + Groups.Name, '' AS Cond + FROM Groups + INNER JOIN PackageGroups ON PackageGroups.GroupID = Groups.ID + UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type, + Licenses.Name, '' as Cond + FROM Licenses + INNER JOIN PackageLicenses ON PackageLicenses.LicenseID = Licenses.ID + UNION SELECT Packages.ID AS ID, 'Keywords' AS Type, + PackageKeywords.Keyword AS Name, '' as Cond + FROM PackageKeywords + INNER JOIN Packages ON Packages.PackageBaseID = 
PackageKeywords.PackageBaseID + """ + return get_extended_dict(query) + + +EXTENDED_FIELD_HANDLERS = { + "--extended": get_limited_extended_fields, + "--all": get_extended_fields +} + + +def is_decimal(column): + """ Check if an SQL column is of decimal.Decimal type. """ + if isinstance(column, Decimal): + return float(column) + return column + + +def write_archive(archive: str, output: list): + with gzip.open(archive, "wb") as f: + f.write(b"[\n") + for i, item in enumerate(output): + f.write(orjson.dumps(item)) + if i < len(output) - 1: + f.write(b",") + f.write(b"\n") + f.write(b"]") + + def main(): conn = aurweb.db.Connection() @@ -64,47 +224,52 @@ def main(): updated, update_time = should_update(packages_state, "Packages") if not updated: print("Updating Packages...") - columns = ("Packages.ID, PackageBaseID, Packages.Name, " - "Version, Description, URL") + + # Query columns; copied from RPC. + columns = ("Packages.ID, Packages.Name, " + "PackageBases.ID AS PackageBaseID, " + "PackageBases.Name AS PackageBase, " + "Version, Description, URL, NumVotes, " + "Popularity, OutOfDateTS AS OutOfDate, " + "Users.UserName AS Maintainer, " + "SubmittedTS AS FirstSubmitted, " + "ModifiedTS AS LastModified") + + # Perform query. cur = conn.execute(f"SELECT {columns} FROM Packages " - "INNER JOIN PackageBases " + "LEFT JOIN PackageBases " "ON PackageBases.ID = Packages.PackageBaseID " + "LEFT JOIN Users " + "ON PackageBases.MaintainerUID = Users.ID " "WHERE PackageBases.PackagerUID IS NOT NULL") - # Store JSON-data in `output`, which can be reused for the - # more basic packagesfile generation afterward. - output = dict() - with gzip.open(packagesmetafile, "wt") as f: - """ The output "data" json key points to a list of dictionaries, - each representing a single result, filled with column names as - keys and column values as values. 
- - Example: - { - "data": [ - { - "ID": 123, - "Name": "package_name", - "PackageBaseID": 234, - "Version": "0.1.1", - "Description": "Some description...", - "URL": "https://some.url" - }, - ... - ] - } - """ - output = [{ - column[0]: result[i] + # Produce packages-meta-v1.json.gz + output = list() + snapshot_uri = aurweb.config.get("options", "snapshot_uri") + for result in cur.fetchall(): + item = { + column[0]: is_decimal(result[i]) for i, column in enumerate(cur.description) - } for result in cur.fetchall()] - json.dump({ - "warning": ("This is a experimental! It can be removed " - "or modified without warning!"), - "data": output - }, f) - - with gzip.open(packagesfile, "w") as f: + } + item["URLPath"] = snapshot_uri % item.get("Name") + output.append(item) + + write_archive(packagesmetafile, output) + + # Produce packages-meta-ext-v1.json.gz + if len(sys.argv) > 1 and sys.argv[1] in EXTENDED_FIELD_HANDLERS: + f = EXTENDED_FIELD_HANDLERS.get(sys.argv[1]) + data = f() + + default_ = {"Groups": [], "License": [], "Keywords": []} + for i in range(len(output)): + data_ = data.get(output[i].get("ID"), default_) + output[i].update(data_) + + write_archive(packagesmetaextfile, output) + + # Produce packages.gz + with gzip.open(packagesfile, "wb") as f: f.write(bytes(pkglist_header + "\n", "UTF-8")) f.writelines([ bytes(x.get("Name") + "\n", "UTF-8") @@ -118,6 +283,7 @@ def main(): updated, update_time = should_update(pkgbases_state, "PackageBases") if not updated: print("Updating PackageBases...") + # Produce pkgbase.gz with gzip.open(pkgbasefile, "w") as f: f.write(bytes(pkgbaselist_header + "\n", "UTF-8")) cur = conn.execute("SELECT Name FROM PackageBases " + @@ -130,6 +296,7 @@ def main(): updated, update_time = should_update(users_state, "Users") if not updated: print("Updating Users...") + # Produce users.gz with gzip.open(userfile, "w") as f: f.write(bytes(userlist_header + "\n", "UTF-8")) cur = conn.execute("SELECT UserName FROM Users") diff --git 
a/conf/config.defaults b/conf/config.defaults index 36ea02efb..a04f21bcc 100644 --- a/conf/config.defaults +++ b/conf/config.defaults @@ -93,5 +93,6 @@ server = ftp://mirrors.kernel.org/archlinux/%s/os/x86_64 [mkpkglists] packagesfile = /srv/http/aurweb/web/html/packages.gz packagesmetafile = /srv/http/aurweb/web/html/packages-meta-v1.json.gz +packagesmetaextfile = /srv/http/aurweb/web/html/packages-meta-ext-v1.json.gz pkgbasefile = /srv/http/aurweb/web/html/pkgbase.gz userfile = /srv/http/aurweb/web/html/users.gz diff --git a/test/setup.sh b/test/setup.sh index 24bb5f48a..f74cd1b7d 100644 --- a/test/setup.sh +++ b/test/setup.sh @@ -31,6 +31,7 @@ enable-maintenance = 0 maintenance-exceptions = 127.0.0.1 commit_uri = https://aur.archlinux.org/cgit/aur.git/log/?h=%s&id=%s localedir = $TOPLEVEL/web/locale/ +snapshot_uri = /cgit/aur.git/snapshot/%s.tar.gz [notifications] notify-cmd = $NOTIFY @@ -62,6 +63,7 @@ server = file://$(pwd)/remote/ [mkpkglists] packagesfile = packages.gz packagesmetafile = packages-meta-v1.json.gz +packagesmetaextfile = packages-meta-ext-v1.json.gz pkgbasefile = pkgbase.gz userfile = users.gz EOF -- GitLab