Skip to content
Snippets Groups Projects

mkpkglists improvements

Merged Kevin Morris requested to merge kevr/aurweb:teapot into master
All threads resolved!
Compare and Show latest version
2 files
+ 158
59
Compare changes
  • Side-by-side
  • Inline
Files
2
+ 157
59
#!/usr/bin/env python3
"""
Produces package, package base and user archives for the AUR
database.
Archives:
packages.gz | A line-separated list of package names
packages-meta-v1.json.gz | A gzipped type=search RPC-formatted JSON dataset
packages-meta-ext-v1.json.gz | A gzipped --extended or --all archive
pkgbase.gz | A line-separated list of package base names
users.gz | A line-separated list of user names
This script takes an optional argument: --extended or --all. Based
on the following, right-hand side fields are added to each item.
--extended | License, Keywords, Groups
--all | License, Keywords, Groups, relations and dependencies
"""
import datetime
import gzip
import json
import os
import sys
from collections import defaultdict
from decimal import Decimal
from typing import Tuple
from typing import List, Tuple
import orjson
import aurweb.config
import aurweb.db
@@ -23,6 +43,7 @@ def state_path(archive: str) -> str:
# Output archive locations, each read from the [mkpkglists] config section.
packagesfile = aurweb.config.get('mkpkglists', 'packagesfile')
packagesmetafile = aurweb.config.get('mkpkglists', 'packagesmetafile')
# Target of the --extended / --all archive.
packagesmetaextfile = aurweb.config.get('mkpkglists', 'packagesmetaextfile')
# State path derived from the packages archive path via state_path().
packages_state = state_path(packagesfile)
pkgbasefile = aurweb.config.get('mkpkglists', 'pkgbasefile')
@@ -64,57 +85,129 @@ TYPE_MAP = {
"conflicts": "Conflicts",
"provides": "Provides",
"replaces": "Replaces",
"groups": "Groups",
"license": "License",
"keyword": "Keywords"
}
def get_extended_fields(package_id: int, pkgbase_id: int):
def get_extended_dict(query: str):
    """
    Run one bulk SQL query and group its rows per package.

    Every row returned by `query` is read positionally as
    (package ID, type, name, condition) and folded into:

        {
            <integer_package_id>: {
                "Depends": [...],
                "Conflicts": [...],
                "License": [...]
            }
        }

    The caller can then use this data to populate a dataset of packages:

        output = produce_base_output_data()
        data = get_extended_dict(query)
        for item in output:
            item.update(data.get(item.get("ID"), {}))

    :param query: SQL text yielding four columns per row, in the order above
    :return: nested defaultdict of package ID -> field name -> list of str
    """
    data = defaultdict(lambda: defaultdict(list))

    conn = aurweb.db.Connection()
    try:
        cursor = conn.execute(query)
        for result in cursor.fetchall():
            pkgid = result[0]

            # Types that are not TYPE_MAP keys (e.g. the literal 'Groups',
            # 'License' and 'Keywords' selected by the bulk queries) are
            # used as-is instead of being filed under a None key.
            key = TYPE_MAP.get(result[1], result[1])

            # A non-empty condition is appended directly to the name.
            output = result[2]
            if result[3]:
                output += result[3]

            fields = data[pkgid]
            # In all cases, a package that produced any row has at least
            # empty License, Keywords and Groups lists.
            for always_present in ("License", "Keywords", "Groups"):
                fields.setdefault(always_present, [])

            fields[key].append(output)
    finally:
        # Release the connection even when the query itself fails.
        conn.close()

    return data
def get_limited_extended_fields(package_ids: List[int]):
    """
    Collect the --extended fields (Groups, License, Keywords) in bulk.

    The query is not filtered by package: it covers every package in the
    database and the caller picks out the IDs it needs from the result.

    :param package_ids: accepted for a uniform handler signature; it is
        not used to build the query, so an empty list is fine
    :return: the mapping produced by get_extended_dict()
    """
    # Returns: [ID, Type, Name, Cond]
    query = """
    SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type,
    Groups.Name, '' AS Cond
    FROM Groups INNER JOIN PackageGroups
    ON PackageGroups.GroupID = Groups.ID
    UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type,
    Licenses.Name, '' as Cond
    FROM Licenses INNER JOIN PackageLicenses
    ON PackageLicenses.LicenseID = Licenses.ID
    UNION SELECT Packages.ID AS ID, 'Keywords' AS Type,
    PackageKeywords.Keyword AS Name, '' as Cond
    FROM PackageKeywords
    INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID
    """
    return get_extended_dict(query)
def get_extended_fields(package_ids: List[int]):
    """
    Collect the --all fields in bulk: dependencies, relations, Groups,
    License and Keywords.

    The query is not filtered by package and takes no bind parameters:
    it covers every package in the database and the caller picks out the
    IDs it needs from the result.

    :param package_ids: accepted for a uniform handler signature; it is
        not used to build the query, so an empty list is fine
    :return: the mapping produced by get_extended_dict()
    """
    # Returns: [ID, Type, Name, Cond]
    # Every UNION arm selects the same four columns in the same order.
    query = """
    SELECT PackageDepends.PackageID AS ID, DependencyTypes.Name AS Type,
    PackageDepends.DepName AS Name, PackageDepends.DepCondition AS Cond
    FROM PackageDepends
    LEFT JOIN DependencyTypes
    ON DependencyTypes.ID = PackageDepends.DepTypeID
    UNION SELECT PackageRelations.PackageID AS ID, RelationTypes.Name AS Type,
    PackageRelations.RelName AS Name,
    PackageRelations.RelCondition AS Cond
    FROM PackageRelations
    LEFT JOIN RelationTypes
    ON RelationTypes.ID = PackageRelations.RelTypeID
    UNION SELECT PackageGroups.PackageID AS ID, 'Groups' AS Type,
    Groups.Name, '' AS Cond
    FROM Groups
    INNER JOIN PackageGroups ON PackageGroups.GroupID = Groups.ID
    UNION SELECT PackageLicenses.PackageID AS ID, 'License' AS Type,
    Licenses.Name, '' as Cond
    FROM Licenses
    INNER JOIN PackageLicenses ON PackageLicenses.LicenseID = Licenses.ID
    UNION SELECT Packages.ID AS ID, 'Keywords' AS Type,
    PackageKeywords.Keyword AS Name, '' as Cond
    FROM PackageKeywords
    INNER JOIN Packages ON Packages.PackageBaseID = PackageKeywords.PackageBaseID
    """
    return get_extended_dict(query)
conn = aurweb.db.Connection()
args = [package_id] * 4 + [pkgbase_id]
cursor = conn.execute(query, args)
# Maps the optional command-line argument to the function that fetches
# the extra per-package fields for that mode.
EXTENDED_FIELD_HANDLERS = {
"--extended": get_limited_extended_fields,
"--all": get_extended_fields
}
data = defaultdict(list)
data["License"] = []
data["Keywords"] = []
for result in cursor.fetchall():
key = TYPE_MAP.get(result[0])
output = result[1]
if result[2]:
output += result[2]
data[key].append(output)
conn.close()
return data
def is_decimal(column):
    """
    Normalize an SQL column value: decimal.Decimal becomes float.

    Despite the name, this returns a value rather than a boolean; any
    non-Decimal value is handed back unchanged.
    """
    return float(column) if isinstance(column, Decimal) else column
def main():
@@ -147,10 +240,13 @@ def main():
"ON PackageBases.MaintainerUID = Users.ID "
"WHERE PackageBases.PackagerUID IS NOT NULL")
# Store JSON-data in `output`, which can be reused for the
# more basic packagesfile generation afterward.
output = dict()
with gzip.open(packagesmetafile, "wt") as f:
# Produce packages-meta-v1.json.gz
output = list()
warning = ("This is a experimental! It can be removed "
"or modified without warning!")
snapshot_uri = aurweb.config.get("options", "snapshot_uri")
with gzip.open(packagesmetafile, "wb") as f:
""" The output "data" json key points to a list of dictionaries,
each representing a single result. All items are produced in
the same format that RPC type=search uses.
@@ -162,31 +258,31 @@ def main():
}
"""
def is_decimal(column):
""" Check if an SQL column is of decimal.Decimal type. """
if isinstance(column, Decimal):
return float(column)
return column
output = []
for result in cur.fetchall():
item = {
column[0]: is_decimal(result[i])
for i, column in enumerate(cur.description)
}
if sys.argv[1] == "--extended":
extended_fields = get_extended_fields(result[0], result[2])
item.update(extended_fields)
item["URLPath"] = snapshot_uri % item.get("Name")
output.append(item)
json.dump({
"warning": ("This is a experimental! It can be removed "
"or modified without warning!"),
"data": output
}, f)
f.write(orjson.dumps({"warning": warning, "data": output}))
# Produce packages-meta-ext-v1.json.gz
if len(sys.argv) > 1 and sys.argv[1] in EXTENDED_FIELD_HANDLERS:
f = EXTENDED_FIELD_HANDLERS.get(sys.argv[1])
data = f([x.get("ID") for x in output])
default_ = {"Groups": [], "License": [], "Keywords": []}
for i in range(len(output)):
data_ = data.get(output[i].get("ID"), default_)
output[i].update(data_)
with gzip.open(packagesmetaextfile, "wb") as f:
f.write(orjson.dumps({"warning": warning, "data": output}))
with gzip.open(packagesfile, "w") as f:
# Produce packages.gz
with gzip.open(packagesfile, "wb") as f:
f.write(bytes(pkglist_header + "\n", "UTF-8"))
f.writelines([
bytes(x.get("Name") + "\n", "UTF-8")
@@ -200,6 +296,7 @@ def main():
updated, update_time = should_update(pkgbases_state, "PackageBases")
if not updated:
print("Updating PackageBases...")
# Produce pkgbase.gz
with gzip.open(pkgbasefile, "w") as f:
f.write(bytes(pkgbaselist_header + "\n", "UTF-8"))
cur = conn.execute("SELECT Name FROM PackageBases " +
@@ -212,6 +309,7 @@ def main():
updated, update_time = should_update(users_state, "Users")
if not updated:
print("Updating Users...")
# Produce users.gz
with gzip.open(userfile, "w") as f:
f.write(bytes(userlist_header + "\n", "UTF-8"))
cur = conn.execute("SELECT UserName FROM Users")
Loading