Skip to content
Snippets Groups Projects
Verified Commit e5cd3e1e authored by Kevin Morris's avatar Kevin Morris
Browse files

fix(mkpkglists): add all RPC type=search fields to packages-meta-v1


The SQL logic in this file for package metadata now exactly
reflects RPC's search logic, without searching for specific
packages.

Two command line arguments are available:

    --extended | Include License, Keywords and Groups fields
    --all      | Include License, Keywords, Groups, dependencies and
                 relations

When one of these arguments is passed, the script will also create
a packages-meta-ext-v1.json.gz, configured via packagesmetaextfile.

Done this way so that there are multiple options for archive;
the roughly 2.4MB basic metadata (-meta-v1) and the
~4-7MB (--extended) / 9.8MB (--all) metadata (-meta-ext-v1).

Archive JSON is in the following format: line-separated package objects
enclosed in a list:

    [
    {...},
    {...},
    {...}
    ]

This allows users to stream the archive and process packages
line-by-line, during the local storage process.

Signed-off-by: Kevin Morris's avatarKevin Morris <kevr@0cost.org>
parent d33845e8
No related branches found
No related tags found
No related merge requests found
This commit is part of merge request !233. Comments created here will be created in the context of that merge request.
...@@ -12,7 +12,7 @@ before_script: ...@@ -12,7 +12,7 @@ before_script:
python-pygit2 python-srcinfo python-bleach python-markdown python-pygit2 python-srcinfo python-bleach python-markdown
python-sqlalchemy python-alembic python-pytest python-werkzeug python-sqlalchemy python-alembic python-pytest python-werkzeug
python-pytest-tap python-fastapi hypercorn nginx python-authlib python-pytest-tap python-fastapi hypercorn nginx python-authlib
python-itsdangerous python-httpx python-itsdangerous python-httpx python-orjson
test: test:
script: script:
......
...@@ -50,7 +50,7 @@ read the instructions below. ...@@ -50,7 +50,7 @@ read the instructions below.
# pacman -S python-mysql-connector python-pygit2 python-srcinfo python-sqlalchemy \ # pacman -S python-mysql-connector python-pygit2 python-srcinfo python-sqlalchemy \
python-bleach python-markdown python-alembic python-jinja \ python-bleach python-markdown python-alembic python-jinja \
python-itsdangerous python-authlib python-httpx hypercorn \ python-itsdangerous python-authlib python-httpx hypercorn \
composer composer python-orjson
# python3 setup.py install # python3 setup.py install
4a) Install `composer` dependencies while inside of aurweb's root: 4a) Install `composer` dependencies while inside of aurweb's root:
......
#!/usr/bin/env python3 #!/usr/bin/env python3
"""
Produces package, package base and user archives for the AUR
database.
Archives:
packages.gz | A line-separated list of package names
packages-meta-v1.json | A type=search RPC-formatted JSON dataset
packages-meta-ext-v1.json | An --extended or --all archive
pkgbase.gz | A line-separated list of package base names
users.gz | A line-separated list of user names
This script takes an optional argument: --extended or --all. Depending
on which one is given, the fields listed on the right-hand side below
are added to each item.
--extended | License, Keywords, Groups
--all | License, Keywords, Groups, relations and dependencies
"""
import datetime import datetime
import gzip import gzip
import json
import os import os
import sys
from collections import defaultdict
from decimal import Decimal
from typing import Tuple from typing import Tuple
import orjson
import aurweb.config import aurweb.config
import aurweb.db import aurweb.db
...@@ -20,6 +43,7 @@ def state_path(archive: str) -> str: ...@@ -20,6 +43,7 @@ def state_path(archive: str) -> str:
packagesfile = aurweb.config.get('mkpkglists', 'packagesfile') packagesfile = aurweb.config.get('mkpkglists', 'packagesfile')
packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile') packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile')
packagesmetaextfile = aurweb.config.get('mkpkglists', 'packagesmetaextfile')
packages_state = state_path(packagesfile) packages_state = state_path(packagesfile)
pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile') pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile')
...@@ -53,6 +77,142 @@ def update_state(state: str, update_time: int) -> None: ...@@ -53,6 +77,142 @@ def update_state(state: str, update_time: int) -> None:
f.write(str(update_time)) f.write(str(update_time))
# Translates the lowercase type names looked up by get_extended_dict into
# the capitalized field names used as keys in the archive output. Each key
# is simply the lowercase form of its value.
TYPE_MAP = {
    field_name.lower(): field_name
    for field_name in (
        "Depends",
        "MakeDepends",
        "CheckDepends",
        "OptDepends",
        "Conflicts",
        "Provides",
        "Replaces",
    )
}
def get_extended_dict(query: str):
    """
    Run `query` as a single bulk SQL query and group its rows by package.

    Each row is read positionally as [ID, Type, Name, Cond]. The result
    has the form:

        {
            <integer_package_id>: {
                "Depends": [...],
                "Conflicts": [...],
                "License": [...]
            }
        }

    The caller can then use this data to populate a dataset of packages.

        output = produce_base_output_data()
        data = get_extended_dict(query)
        for i in range(len(output)):
            package_id = output[i].get("ID")
            output[i].update(data.get(package_id, {}))

    :param query: SQL text selecting the ID, Type, Name and Cond columns
    :return: mapping of package ID to a mapping of field name to list;
             every package present has at least (possibly empty)
             "License", "Keywords" and "Groups" lists
    """
    data = defaultdict(lambda: defaultdict(list))

    conn = aurweb.db.Connection()
    try:
        cursor = conn.execute(query)
        for result in cursor.fetchall():
            pkgid = result[0]

            # TYPE_MAP only knows the dependency/relation type names.
            # The 'Groups', 'License' and 'Keywords' rows already carry
            # their final field name, so fall back to the raw type rather
            # than filing those values under a None key.
            key = TYPE_MAP.get(result[1], result[1])

            # Name, with the condition appended when there is one.
            output = result[2]
            if result[3]:
                output += result[3]

            fields = data[pkgid]
            # In all cases, we have at least an empty License, Keywords
            # and Groups list.
            for required in ("License", "Keywords", "Groups"):
                fields.setdefault(required, [])

            fields[key].append(output)
    finally:
        # Release the connection even when the query or a row fails.
        conn.close()

    return data
def get_limited_extended_fields():
    """
    Gather the Groups, License and Keywords fields of packages.

    The query yields rows of [ID, Type, Name, Cond]; Cond is always the
    empty string for these three field types. The rows are grouped per
    package by get_extended_dict, whose result is returned unchanged.
    """
    sql = """
SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type,
Groups.Name, '' AS Cond
FROM Groups INNER JOIN PackageGroups
ON PackageGroups.GroupID = Groups.ID
UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type,
Licenses.Name, '' as Cond
FROM Licenses INNER JOIN PackageLicenses
ON PackageLicenses.LicenseID = Licenses.ID
UNION SELECT Packages.ID AS ID, 'Keywords' AS Type,
PackageKeywords.Keyword AS Name, '' as Cond
FROM PackageKeywords
INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID
"""
    return get_extended_dict(sql)
def get_extended_fields():
    """
    Gather dependencies, relations, Groups, License and Keywords fields.

    The query yields rows of [ID, Type, Name, Cond]. Dependency and
    relation rows carry their type name and condition columns; Groups,
    License and Keywords rows use a fixed type and an empty Cond. The
    rows are grouped per package by get_extended_dict, whose result is
    returned unchanged.
    """
    sql = """
SELECT PackageDepends.PackageID AS ID, DependencyTypes.Name AS Type,
PackageDepends.DepName AS Name, PackageDepends.DepCondition AS Cond
FROM PackageDepends
LEFT JOIN DependencyTypes
ON DependencyTypes.ID = PackageDepends.DepTypeID
UNION SELECT PackageRelations.PackageID AS ID, RelationTypes.Name AS Type,
PackageRelations.RelName AS Name,
PackageRelations.RelCondition AS Cond
FROM PackageRelations
LEFT JOIN RelationTypes
ON RelationTypes.ID = PackageRelations.RelTypeID
UNION SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type,
Groups.Name, '' AS Cond
FROM Groups
INNER JOIN PackageGroups ON PackageGroups.GroupID = Groups.ID
UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type,
Licenses.Name, '' as Cond
FROM Licenses
INNER JOIN PackageLicenses ON PackageLicenses.LicenseID = Licenses.ID
UNION SELECT Packages.ID AS ID, 'Keywords' AS Type,
PackageKeywords.Keyword AS Name, '' as Cond
FROM PackageKeywords
INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID
"""
    return get_extended_dict(sql)
# Dispatch table from a supported command line argument to the function
# that gathers the additional per-package fields for that argument.
# main() looks sys.argv[1] up in this table to decide whether to produce
# the extended archive.
EXTENDED_FIELD_HANDLERS = {
    "--extended": get_limited_extended_fields,
    "--all": get_extended_fields
}
def is_decimal(column):
    """
    Convert a decimal.Decimal column value to float.

    Despite the name, this does not return a boolean: a Decimal is
    returned as a float, and any other value is returned untouched.
    """
    return float(column) if isinstance(column, Decimal) else column
def write_archive(archive: str, output: list):
    """
    Write `output` to the gzip file `archive` as a JSON list.

    The list brackets sit on their own lines and every item is serialized
    onto a single line, so the archive can be processed line by line.

    :param archive: path of the gzip file to write
    :param output: items to serialize, one per line
    """
    last_index = len(output) - 1
    with gzip.open(archive, "wb") as stream:
        stream.write(b"[\n")
        for index, package in enumerate(output):
            stream.write(orjson.dumps(package))
            # Every item but the final one is followed by a comma.
            stream.write(b"\n" if index == last_index else b",\n")
        stream.write(b"]")
def main(): def main():
conn = aurweb.db.Connection() conn = aurweb.db.Connection()
...@@ -64,47 +224,52 @@ def main(): ...@@ -64,47 +224,52 @@ def main():
updated, update_time = should_update(packages_state, "Packages") updated, update_time = should_update(packages_state, "Packages")
if not updated: if not updated:
print("Updating Packages...") print("Updating Packages...")
columns = ("Packages.ID, PackageBaseID, Packages.Name, "
"Version, Description, URL") # Query columns; copied from RPC.
columns = ("Packages.ID, Packages.Name, "
"PackageBases.ID AS PackageBaseID, "
"PackageBases.Name AS PackageBase, "
"Version, Description, URL, NumVotes, "
"Popularity, OutOfDateTS AS OutOfDate, "
"Users.UserName AS Maintainer, "
"SubmittedTS AS FirstSubmitted, "
"ModifiedTS AS LastModified")
# Perform query.
cur = conn.execute(f"SELECT {columns} FROM Packages " cur = conn.execute(f"SELECT {columns} FROM Packages "
"INNER JOIN PackageBases " "LEFT JOIN PackageBases "
"ON PackageBases.ID = Packages.PackageBaseID " "ON PackageBases.ID = Packages.PackageBaseID "
"LEFT JOIN Users "
"ON PackageBases.MaintainerUID = Users.ID "
"WHERE PackageBases.PackagerUID IS NOT NULL") "WHERE PackageBases.PackagerUID IS NOT NULL")
# Store JSON-data in `output`, which can be reused for the # Produce packages-meta-v1.json.gz
# more basic packagesfile generation afterward. output = list()
output = dict() snapshot_uri = aurweb.config.get("options", "snapshot_uri")
with gzip.open(packagesmetafile, "wt") as f: for result in cur.fetchall():
""" The output "data" json key points to a list of dictionaries, item = {
each representing a single result, filled with column names as column[0]: is_decimal(result[i])
keys and column values as values.
Example:
{
"data": [
{
"ID": 123,
"Name": "package_name",
"PackageBaseID": 234,
"Version": "0.1.1",
"Description": "Some description...",
"URL": "https://some.url"
},
...
]
}
"""
output = [{
column[0]: result[i]
for i, column in enumerate(cur.description) for i, column in enumerate(cur.description)
} for result in cur.fetchall()] }
json.dump({ item["URLPath"] = snapshot_uri % item.get("Name")
"warning": ("This is a experimental! It can be removed " output.append(item)
"or modified without warning!"),
"data": output write_archive(packagesmetafile, output)
}, f)
# Produce packages-meta-ext-v1.json.gz
with gzip.open(packagesfile, "w") as f: if len(sys.argv) > 1 and sys.argv[1] in EXTENDED_FIELD_HANDLERS:
f = EXTENDED_FIELD_HANDLERS.get(sys.argv[1])
data = f()
default_ = {"Groups": [], "License": [], "Keywords": []}
for i in range(len(output)):
data_ = data.get(output[i].get("ID"), default_)
output[i].update(data_)
write_archive(packagesmetaextfile, output)
# Produce packages.gz
with gzip.open(packagesfile, "wb") as f:
f.write(bytes(pkglist_header + "\n", "UTF-8")) f.write(bytes(pkglist_header + "\n", "UTF-8"))
f.writelines([ f.writelines([
bytes(x.get("Name") + "\n", "UTF-8") bytes(x.get("Name") + "\n", "UTF-8")
...@@ -118,6 +283,7 @@ def main(): ...@@ -118,6 +283,7 @@ def main():
updated, update_time = should_update(pkgbases_state, "PackageBases") updated, update_time = should_update(pkgbases_state, "PackageBases")
if not updated: if not updated:
print("Updating PackageBases...") print("Updating PackageBases...")
# Produce pkgbase.gz
with gzip.open(pkgbasefile, "w") as f: with gzip.open(pkgbasefile, "w") as f:
f.write(bytes(pkgbaselist_header + "\n", "UTF-8")) f.write(bytes(pkgbaselist_header + "\n", "UTF-8"))
cur = conn.execute("SELECT Name FROM PackageBases " + cur = conn.execute("SELECT Name FROM PackageBases " +
...@@ -130,6 +296,7 @@ def main(): ...@@ -130,6 +296,7 @@ def main():
updated, update_time = should_update(users_state, "Users") updated, update_time = should_update(users_state, "Users")
if not updated: if not updated:
print("Updating Users...") print("Updating Users...")
# Produce users.gz
with gzip.open(userfile, "w") as f: with gzip.open(userfile, "w") as f:
f.write(bytes(userlist_header + "\n", "UTF-8")) f.write(bytes(userlist_header + "\n", "UTF-8"))
cur = conn.execute("SELECT UserName FROM Users") cur = conn.execute("SELECT UserName FROM Users")
......
...@@ -93,5 +93,6 @@ server = ftp://mirrors.kernel.org/archlinux/%s/os/x86_64 ...@@ -93,5 +93,6 @@ server = ftp://mirrors.kernel.org/archlinux/%s/os/x86_64
[mkpkglists] [mkpkglists]
packagesfile = /srv/http/aurweb/web/html/packages.gz packagesfile = /srv/http/aurweb/web/html/packages.gz
packagesmetafile = /srv/http/aurweb/web/html/packages-meta-v1.json.gz packagesmetafile = /srv/http/aurweb/web/html/packages-meta-v1.json.gz
packagesmetaextfile = /srv/http/aurweb/web/html/packages-meta-ext-v1.json.gz
pkgbasefile = /srv/http/aurweb/web/html/pkgbase.gz pkgbasefile = /srv/http/aurweb/web/html/pkgbase.gz
userfile = /srv/http/aurweb/web/html/users.gz userfile = /srv/http/aurweb/web/html/users.gz
...@@ -31,6 +31,7 @@ enable-maintenance = 0 ...@@ -31,6 +31,7 @@ enable-maintenance = 0
maintenance-exceptions = 127.0.0.1 maintenance-exceptions = 127.0.0.1
commit_uri = https://aur.archlinux.org/cgit/aur.git/log/?h=%s&id=%s commit_uri = https://aur.archlinux.org/cgit/aur.git/log/?h=%s&id=%s
localedir = $TOPLEVEL/web/locale/ localedir = $TOPLEVEL/web/locale/
snapshot_uri = /cgit/aur.git/snapshot/%s.tar.gz
[notifications] [notifications]
notify-cmd = $NOTIFY notify-cmd = $NOTIFY
...@@ -62,6 +63,7 @@ server = file://$(pwd)/remote/ ...@@ -62,6 +63,7 @@ server = file://$(pwd)/remote/
[mkpkglists] [mkpkglists]
packagesfile = packages.gz packagesfile = packages.gz
packagesmetafile = packages-meta-v1.json.gz packagesmetafile = packages-meta-v1.json.gz
packagesmetaextfile = packages-meta-ext-v1.json.gz
pkgbasefile = pkgbase.gz pkgbasefile = pkgbase.gz
userfile = users.gz userfile = users.gz
EOF EOF
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment