Commit 9112c623 authored by Florian Pritz
Browse files

Upload files in chunks of 20 files at a time



Some packages have very many files (stack has ~1376) that are currently
uploaded all at once. If there is an exception during this run or if the
program is killed (happened due to 'open files' limit), the successfully
uploaded files are not added to the sqlite db. Next time they are
uploaded again and the upload may be interrupted again, leading to huge
time loss.

To fix this, we now upload in chunks of 20 files at a time and add them
to the db after this. Thus, as long as we manage to upload 20 files
without crashing, the next time, there will be fewer files to upload.
Signed-off-by: Florian Pritz <bluewind@xinu.at>
parent 8bebf399
......@@ -4,7 +4,7 @@ import upload_pkg_internetarchive
import DB
import mock
from unittest.mock import MagicMock
from unittest.mock import MagicMock, call
import unittest
class TestUploader(unittest.TestCase):
......@@ -13,11 +13,13 @@ class TestUploader(unittest.TestCase):
mock_uploader = MagicMock()
app = upload_pkg_internetarchive.ArchiveUploader(mock_uploader,
DB.DB(':memory:'))
app.chunksize = 2
response_ok = MagicMock(status_code=200)
mock_uploader.upload.side_effect = [
[response_ok, response_ok, response_ok, response_ok]
[response_ok, response_ok],
[response_ok, response_ok]
]
self.assertFalse(app.db.exists('fb-client-2.0.4-1-any.pkg.tar.xz'))
......@@ -32,23 +34,29 @@ class TestUploader(unittest.TestCase):
app.main(['./test-data/archive/packages/f/fb-client'])
mock_uploader.upload.assert_called_once_with('archlinux_pkg_fb-client',
mock_uploader.upload.assert_has_calls([
call('archlinux_pkg_fb-client',
files=['./test-data/archive/packages/f/fb-client/fb-client-2.0.3-2-any.pkg.tar.xz',
'./test-data/archive/packages/f/fb-client/fb-client-2.0.3-2-any.pkg.tar.xz.sig',
'./test-data/archive/packages/f/fb-client/fb-client-2.0.4-1-any.pkg.tar.xz',
'./test-data/archive/packages/f/fb-client/fb-client-2.0.3-2-any.pkg.tar.xz.sig',],
metadata=mock.ANY),
call('archlinux_pkg_fb-client',
files=['./test-data/archive/packages/f/fb-client/fb-client-2.0.4-1-any.pkg.tar.xz',
'./test-data/archive/packages/f/fb-client/fb-client-2.0.4-1-any.pkg.tar.xz.sig',],
metadata=mock.ANY)
metadata=mock.ANY),
])
def test_upload_pkg_error(self):
mock_uploader = MagicMock()
app = upload_pkg_internetarchive.ArchiveUploader(mock_uploader,
DB.DB(':memory:'))
app.chunksize = 2
response_ok = MagicMock(status_code=200)
response_error = MagicMock(status_code=500)
mock_uploader.upload.side_effect = [
[response_ok, response_ok, response_error, response_ok]
[response_ok, response_ok],
[response_error, response_ok]
]
self.assertFalse(app.db.exists('fb-client-2.0.4-1-any.pkg.tar.xz'))
......@@ -56,12 +64,16 @@ class TestUploader(unittest.TestCase):
app.main(['./test-data/archive/packages/f/fb-client'])
mock_uploader.upload.assert_called_once_with('archlinux_pkg_fb-client',
mock_uploader.upload.assert_has_calls([
call('archlinux_pkg_fb-client',
files=['./test-data/archive/packages/f/fb-client/fb-client-2.0.3-2-any.pkg.tar.xz',
'./test-data/archive/packages/f/fb-client/fb-client-2.0.3-2-any.pkg.tar.xz.sig',
'./test-data/archive/packages/f/fb-client/fb-client-2.0.4-1-any.pkg.tar.xz',
'./test-data/archive/packages/f/fb-client/fb-client-2.0.3-2-any.pkg.tar.xz.sig',],
metadata=mock.ANY),
call('archlinux_pkg_fb-client',
files=['./test-data/archive/packages/f/fb-client/fb-client-2.0.4-1-any.pkg.tar.xz',
'./test-data/archive/packages/f/fb-client/fb-client-2.0.4-1-any.pkg.tar.xz.sig',],
metadata=mock.ANY)
metadata=mock.ANY),
])
self.assertFalse(app.db.exists('fb-client-2.0.4-1-any.pkg.tar.xz'))
self.assertTrue(app.db.exists('fb-client-2.0.3-2-any.pkg.tar.xz'))
......
......@@ -8,6 +8,9 @@ import tarfile
import internetarchive as ia
import DB
# Source: http://stackoverflow.com/a/434328/953022
def chunker(seq, size):
    """Split *seq* into consecutive slices of at most *size* items.

    Args:
        seq: A sliceable sequence supporting ``len()`` (list, tuple, str...).
        size: Maximum number of items per chunk; must be a positive integer.

    Returns:
        A generator yielding ``seq[pos:pos + size]`` for each chunk, in
        order. The last chunk may be shorter. An empty *seq* yields nothing.

    Raises:
        ValueError: If *size* is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject non-positive sizes up front instead of yielding no chunks.
    if size < 1:
        raise ValueError(f"chunk size must be a positive integer, got {size!r}")
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
class ArchiveUploader:
DESCRIPTION = """{pkgdesc}
......@@ -21,6 +24,7 @@ class ArchiveUploader:
def __init__(self, internetarchive=ia, db=None):
    """Create an uploader.

    Args:
        internetarchive: Object providing ``upload(identifier, files=...,
            metadata=...)``; defaults to the ``internetarchive`` module.
        db: Database used to remember which files were already uploaded.
            When omitted, ``DB.DB('archive-uploader.sqlite')`` is created.
    """
    self.ia = internetarchive
    # Build the default database at call time rather than in the signature:
    # a default argument is evaluated once when the function is defined, so
    # the old default ran DB.DB(...) on import (presumably opening the
    # sqlite file - confirm against DB.DB) and shared one instance between
    # all uploaders, even those given an explicit db.
    self.db = DB.DB('archive-uploader.sqlite') if db is None else db
    # Number of files passed to a single upload call (see upload_pkg).
    self.chunksize = 20
def clean_name(self, name):
"""Remove chars that are not allowed in an Internet Archive identifier: @.+
......@@ -51,43 +55,44 @@ class ArchiveUploader:
def upload_pkg(self, identifier, pkgname, metadata, directory):
    """Upload all not-yet-uploaded files of the package in [directory].

    Files whose basename is already known to ``self.db`` are skipped. The
    remaining files are uploaded in chunks of ``self.chunksize`` files, and
    each file answered with status code 200 is recorded in the database
    right after its chunk, so an interrupted run does not have to repeat
    the chunks that already succeeded.

    Args:
        identifier: Internet Archive item identifier to upload to.
        pkgname: Package name, used in the item description.
        metadata: Metadata dict; ``description`` and ``rights`` are set on
            it in place before uploading.
        directory: Directory containing the package files and signatures.

    Raises:
        Exception: Whatever the upload raises is reported on stderr and
            re-raised; the directory is printed on stdout beforehand.
    """
    pending = []
    packages = []
    # Context manager closes the directory handle even if db.exists() raises.
    with os.scandir(directory) as entries:
        for entry in entries:
            if not entry.name.endswith('.sig'):
                packages.append(entry.path)
            if not self.db.exists(entry.name):
                pending.append(entry.path)

    if not pending:
        return

    # ensure reproducible order for tests
    pending.sort()

    # Get last package, to extract a description. After a partial failure
    # only signatures may be pending, so fall back to any package file in
    # the directory instead of indexing into an empty list.
    candidates = [path for path in pending if not path.endswith('.sig')] or packages
    if not candidates:
        print(f"{identifier}: no package file found to read metadata from", file=sys.stderr)
        print(directory)
        return
    last_pkg = max(candidates)

    pkginfo = self.extract_pkginfo(last_pkg)
    pkgdesc = pkginfo['pkgdesc'] if 'pkgdesc' in pkginfo else ''
    metadata['description'] = ArchiveUploader.DESCRIPTION.format(pkgname=pkgname, pkgdesc=pkgdesc, url=pkginfo['url'], license=pkginfo['license'])
    metadata['rights'] = 'License: ' + pkginfo['license']

    for files in chunker(pending, self.chunksize):
        try:
            res = self.ia.upload(identifier, files=files, metadata=metadata)
            failed = False
            for path, response in zip(files, res):
                code = response.status_code
                if code == 200:
                    self.db.add_file(os.path.basename(path))
                else:
                    print(f"Upload failed with status code '{code}' for directory '{directory}' and file: {path}", file=sys.stderr)
                    failed = True
            if failed:
                # Report the directory on stdout so it can be retried.
                print(directory)
        except Exception as e:
            print(f"{identifier}: exception raised", file=sys.stderr)
            print(e, file=sys.stderr)
            print(directory)
            raise
def main(self, pkg_dirs):
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment