diff --git a/aurweb/scripts/mkpkglists.py b/aurweb/scripts/mkpkglists.py
index 646d086505368ff0c7f4ea56dd54989677cbe6e3..67f4307c459744b3c2fe0931b65ca9672b2a793b 100755
--- a/aurweb/scripts/mkpkglists.py
+++ b/aurweb/scripts/mkpkglists.py
@@ -1,11 +1,34 @@
 #!/usr/bin/env python3
+"""
+Produces package, package base and user archives for the AUR
+database.
+
+Archives:
+
+    packages.gz               | A line-separated list of package names
+    packages-meta-v1.json     | A type=search RPC-formatted JSON dataset
+    packages-meta-ext-v1.json | An --extended or --all archive
+    pkgbase.gz                | A line-separated list of package base names
+    users.gz                  | A line-separated list of user names
+
+This script takes an optional argument: --extended or --all. Based
+on the following, right-hand side fields are added to each item.
+
+    --extended  | License, Keywords, Groups
+    --all       | License, Keywords, Groups, relations and dependencies
+
+"""
 
 import datetime
 import gzip
-import json
 import os
+import sys
+
+from collections import defaultdict
+from decimal import Decimal
+from typing import List, Tuple
 
-from typing import Tuple
+import orjson
 
 import aurweb.config
 import aurweb.db
@@ -20,6 +43,7 @@ def state_path(archive: str) -> str:
 
 packagesfile = aurweb.config.get('mkpkglists', 'packagesfile')
 packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile')
+packagesmetaextfile = aurweb.config.get('mkpkglists', 'packagesmetaextfile')
 
 packages_state = state_path(packagesfile)
 pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile')
@@ -53,6 +77,139 @@ def update_state(state: str, update_time: int) -> None:
         f.write(str(update_time))
 
 
+TYPE_MAP = {
+    "depends": "Depends",
+    "makedepends": "MakeDepends",
+    "checkdepends": "CheckDepends",
+    "optdepends": "OptDepends",
+    "conflicts": "Conflicts",
+    "provides": "Provides",
+    "replaces": "Replaces",
+}
+
+
+def get_extended_dict(query: str):
+    """
+    Run a single bulk SQL query and group its rows by package ID:
+
+    {
+        <integer_package_id>: {
+            "Depends": [...],
+            "Conflicts": [...],
+            "License": [...]
+        }
+    }
+
+    Each row must be of the form [ID, Type, Name, Cond]. Dependency and
+    relation types are translated through TYPE_MAP; any other Type
+    (e.g. 'Groups', 'License', 'Keywords') is used as the key as-is.
+
+    The caller can then use this data to populate a dataset of packages.
+
+    output = produce_base_output_data()
+    data = get_extended_dict(query)
+    for i in range(len(output)):
+        package_id = output[i].get("ID")
+        output[i].update(data.get(package_id, {}))
+    """
+
+    conn = aurweb.db.Connection()
+    data = defaultdict(lambda: defaultdict(list))
+
+    try:
+        cursor = conn.execute(query)
+
+        for result in cursor.fetchall():
+            pkgid = result[0]
+            # Types missing from TYPE_MAP are already output keys.
+            key = TYPE_MAP.get(result[1], result[1])
+            output = result[2]
+            if result[3]:
+                output += result[3]
+
+            # In all cases, we have at least empty License,
+            # Keywords and Groups lists.
+            for field in ("License", "Keywords", "Groups"):
+                data[pkgid].setdefault(field, [])
+
+            data[pkgid][key].append(output)
+    finally:
+        # Release the connection even when the query fails.
+        conn.close()
+
+    return data
+
+
+def get_limited_extended_fields(package_ids: List[int]):
+    """
+    Fetch Groups, License and Keywords rows as [ID, Type, Name, Cond].
+    package_ids is currently unused: the query is not restricted by ID
+    and the caller looks up the packages it needs in the result.
+    """
+    query = """
+    SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type,
+           Groups.Name, '' AS Cond
+    FROM Groups INNER JOIN PackageGroups
+    ON PackageGroups.GroupID = Groups.ID
+    UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type,
+          Licenses.Name, '' as Cond
+    FROM Licenses INNER JOIN PackageLicenses
+    ON PackageLicenses.LicenseID = Licenses.ID
+    UNION SELECT Packages.ID AS ID, 'Keywords' AS Type,
+          PackageKeywords.Keyword AS Name, '' as Cond
+    FROM PackageKeywords
+    INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID
+    """
+    return get_extended_dict(query)
+
+
+def get_extended_fields(package_ids: List[int]):
+    """
+    Fetch dependency, relation, Groups, License and Keywords rows as
+    [ID, Type, Name, Cond]. package_ids is currently unused: the query
+    is not restricted by ID; the caller looks up the IDs it needs.
+    """
+    query = """
+    SELECT PackageDepends.PackageID AS ID, DependencyTypes.Name AS Type,
+           PackageDepends.DepName AS Name, PackageDepends.DepCondition AS Cond
+    FROM PackageDepends
+    LEFT JOIN DependencyTypes
+    ON DependencyTypes.ID = PackageDepends.DepTypeID
+    UNION SELECT PackageRelations.PackageID AS ID, RelationTypes.Name AS Type,
+          PackageRelations.RelName AS Name,
+          PackageRelations.RelCondition AS Cond
+    FROM PackageRelations
+    LEFT JOIN RelationTypes
+    ON RelationTypes.ID = PackageRelations.RelTypeID
+    UNION SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type,
+          Groups.Name, '' AS Cond
+    FROM Groups
+    INNER JOIN PackageGroups ON PackageGroups.GroupID = Groups.ID
+    UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type,
+          Licenses.Name, '' as Cond
+    FROM Licenses
+    INNER JOIN PackageLicenses ON PackageLicenses.LicenseID = Licenses.ID
+    UNION SELECT Packages.ID AS ID, 'Keywords' AS Type,
+          PackageKeywords.Keyword AS Name, '' as Cond
+    FROM PackageKeywords
+    INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID
+    """
+    return get_extended_dict(query)
+
+
+EXTENDED_FIELD_HANDLERS = {
+    "--extended": get_limited_extended_fields,
+    "--all": get_extended_fields
+}
+
+
+def is_decimal(column):
+    """ Return float(column) for decimal.Decimal columns, else column. """
+    if isinstance(column, Decimal):
+        return float(column)
+    return column
+
+
 def main():
     conn = aurweb.db.Connection()
 
@@ -64,47 +221,68 @@ def main():
     updated, update_time = should_update(packages_state, "Packages")
     if not updated:
         print("Updating Packages...")
-        columns = ("Packages.ID, PackageBaseID, Packages.Name, "
-                   "Version, Description, URL")
+
+        # Query columns; copied from RPC.
+        columns = ("Packages.ID, Packages.Name, "
+                   "PackageBases.ID AS PackageBaseID, "
+                   "PackageBases.Name AS PackageBase, "
+                   "Version, Description, URL, NumVotes, "
+                   "Popularity, OutOfDateTS AS OutOfDate, "
+                   "Users.UserName AS Maintainer, "
+                   "SubmittedTS AS FirstSubmitted, "
+                   "ModifiedTS AS LastModified")
+
+        # Perform query.
         cur = conn.execute(f"SELECT {columns} FROM Packages "
-                           "INNER JOIN PackageBases "
+                           "LEFT JOIN PackageBases "
                            "ON PackageBases.ID = Packages.PackageBaseID "
+                           "LEFT JOIN Users "
+                           "ON PackageBases.MaintainerUID = Users.ID "
                            "WHERE PackageBases.PackagerUID IS NOT NULL")
 
-        # Store JSON-data in `output`, which can be reused for the
-        # more basic packagesfile generation afterward.
-        output = dict()
-        with gzip.open(packagesmetafile, "wt") as f:
+        # Produce packages-meta-v1.json.gz
+        output = list()
+        warning = ("This is a experimental! It can be removed "
+                   "or modified without warning!")
+
+        snapshot_uri = aurweb.config.get("options", "snapshot_uri")
+        with gzip.open(packagesmetafile, "wb") as f:
             """ The output "data" json key points to a list of dictionaries,
-            each representing a single result, filled with column names as
-            keys and column values as values.
+            each representing a single result. All items are produced in
+            the same format that RPC type=search uses.
 
             Example:
                 {
-                    "data": [
-                        {
-                            "ID": 123,
-                            "Name": "package_name",
-                            "PackageBaseID": 234,
-                            "Version": "0.1.1",
-                            "Description": "Some description...",
-                            "URL": "https://some.url"
-                        },
-                        ...
-                    ]
+                    "warning": "...",
+                    "data": [{...}, {...}, ...]
                 }
             """
-            output = [{
-                column[0]: result[i]
-                for i, column in enumerate(cur.description)
-            } for result in cur.fetchall()]
-            json.dump({
-                "warning": ("This is a experimental! It can be removed "
-                            "or modified without warning!"),
-                "data": output
-            }, f)
-
-        with gzip.open(packagesfile, "w") as f:
+
+            for result in cur.fetchall():
+                item = {
+                    column[0]: is_decimal(result[i])
+                    for i, column in enumerate(cur.description)
+                }
+                item["URLPath"] = snapshot_uri % item.get("Name")
+                output.append(item)
+
+            f.write(orjson.dumps({"warning": warning, "data": output}))
+
+        # Produce packages-meta-ext-v1.json.gz
+        if len(sys.argv) > 1 and sys.argv[1] in EXTENDED_FIELD_HANDLERS:
+            handler = EXTENDED_FIELD_HANDLERS[sys.argv[1]]
+            data = handler([x.get("ID") for x in output])
+
+            default_ = {"Groups": [], "License": [], "Keywords": []}
+            for item in output:
+                data_ = data.get(item.get("ID"), default_)
+                item.update(data_)
+
+            with gzip.open(packagesmetaextfile, "wb") as f:
+                f.write(orjson.dumps({"warning": warning, "data": output}))
+
+        # Produce packages.gz
+        with gzip.open(packagesfile, "wb") as f:
             f.write(bytes(pkglist_header + "\n", "UTF-8"))
             f.writelines([
                 bytes(x.get("Name") + "\n", "UTF-8")
@@ -118,6 +296,7 @@ def main():
     updated, update_time = should_update(pkgbases_state, "PackageBases")
     if not updated:
         print("Updating PackageBases...")
+        # Produce pkgbase.gz
         with gzip.open(pkgbasefile, "w") as f:
             f.write(bytes(pkgbaselist_header + "\n", "UTF-8"))
             cur = conn.execute("SELECT Name FROM PackageBases " +
@@ -130,6 +309,7 @@ def main():
     updated, update_time = should_update(users_state, "Users")
     if not updated:
         print("Updating Users...")
+        # Produce users.gz
         with gzip.open(userfile, "w") as f:
             f.write(bytes(userlist_header + "\n", "UTF-8"))
             cur = conn.execute("SELECT UserName FROM Users")
diff --git a/conf/config.defaults b/conf/config.defaults
index 36ea02efb3cae45b68df7dcb5f61305dfc01aef6..a04f21bcc3dea6d455889eeda92e73903346a6a5 100644
--- a/conf/config.defaults
+++ b/conf/config.defaults
@@ -93,5 +93,6 @@ server = ftp://mirrors.kernel.org/archlinux/%s/os/x86_64
 [mkpkglists]
 packagesfile = /srv/http/aurweb/web/html/packages.gz
 packagesmetafile = /srv/http/aurweb/web/html/packages-meta-v1.json.gz
+packagesmetaextfile = /srv/http/aurweb/web/html/packages-meta-ext-v1.json.gz
 pkgbasefile = /srv/http/aurweb/web/html/pkgbase.gz
 userfile = /srv/http/aurweb/web/html/users.gz