#!/usr/bin/env python3
"""
usage: gendummydata.py outputfilename.sql
"""
#
# This script seeds the AUR database with dummy data for
# use during development/testing.  It uses random entries
# from /usr/share/dict/words to create user accounts and
# package names.  It generates the SQL statements to
# insert these users/packages into the AUR database.
#
import hashlib
import logging
import os
import random
import sys
import time

# Tunables for the generated data set.
LOG_LEVEL = logging.DEBUG  # logging level. set to logging.INFO to reduce output
SEED_FILE = "/usr/share/dict/words"  # word list the user/package names come from
USER_ID = 5            # Users.ID of first bogus user
PKG_ID = 1             # Packages.ID of first package
MAX_USERS = 300        # how many users to 'register'
MAX_DEVS = .1          # fraction (0..1) of MAX_USERS that are Developers
MAX_TUS = .2           # fraction (0..1) of MAX_USERS that are Trusted Users
MAX_PKGS = 900         # how many packages to load
PKG_DEPS = (1, 15)     # min/max depends a package has
PKG_RELS = (1, 5)      # min/max relations a package has
PKG_SRC = (1, 3)       # min/max sources a package has
PKG_CMNTS = (1, 5)     # min/max number of comments a package has
CATEGORIES_COUNT = 17  # the number of categories from aur-schema
VOTING = (0, .30)      # min/max fraction (0..1) of all packages a user votes for
OPEN_PROPOSALS = 5     # number of open trusted user proposals
CLOSE_PROPOSALS = 15   # number of closed trusted user proposals

# Building blocks for the random package source URLs.
RANDOM_TLDS = ("edu", "com", "org", "net", "tw", "ru", "pl", "de", "es")
RANDOM_URL = ("http://www.", "ftp://ftp.", "http://", "ftp://")
RANDOM_LOCS = ("pub", "release", "files", "downloads", "src")

FORTUNE_FILE = "/usr/share/fortune/cookie"  # text source for comments/agendas

# setup logging
logformat = "%(levelname)s: %(message)s"
logging.basicConfig(format=logformat, level=LOG_LEVEL)
log = logging.getLogger()

# The output filename is the one and only command line argument.
# Every failed check exits with status 1 so that callers (shell scripts,
# Makefiles) can tell the run did not produce any output.
if len(sys.argv) != 2:
    log.error("Missing output filename argument")
    raise SystemExit(1)

# make sure the seed file exists
#
if not os.path.exists(SEED_FILE):
    log.error("Please install the 'words' Arch package")
    raise SystemExit(1)

# make sure comments can be created
#
if not os.path.exists(FORTUNE_FILE):
    log.error("Please install the 'fortune-mod' Arch package")
    raise SystemExit(1)

# track what users/package names have been used
#
seen_users = {}  # user name -> Users.ID
seen_pkgs = {}   # package name -> Packages.ID
user_keys = []   # the user names as a list, for random picks by index


# some functions to generate random data
#
def genVersion():
    """Return a random version string such as "3.14-2" or "3.14.15-2".

    The string is "<major>.<minor>[.<patch>]-<release>" where major is
    0-9, minor is 0-19, the patch component (0-99) is present about half
    of the time and release is 1-10.
    """
    parts = [random.randrange(0, 10), random.randrange(0, 20)]
    if random.randrange(0, 2) == 0:
        parts.append(random.randrange(0, 100))
    release = random.randrange(1, 11)
    return "%s-%d" % (".".join(str(part) for part in parts), release)


def genCategory():
    """Return a random category ID between 1 and CATEGORIES_COUNT - 1.

    NOTE(review): randrange() excludes its upper bound, so the ID
    CATEGORIES_COUNT itself is never returned -- confirm against the
    schema whether that is intended.
    """
    return random.randrange(1, CATEGORIES_COUNT)


def genUID():
    """Return the Users.ID of a randomly picked generated user.

    Reads the module-level `user_keys` list and `seen_users` mapping, so
    it can only be called once the user names have been selected.
    """
    return seen_users[random.choice(user_keys)]


def genFortune():
    """Return a random entry of the module-level `fortunes` list.

    Single quotes are removed because the text is embedded in
    single-quoted SQL string literals by the callers.
    """
    return random.choice(fortunes).replace("'", "")


# load the words, and make sure there are enough words for users/pkgs
#
log.debug("Grabbing words from seed file...")
with open(SEED_FILE, "r", encoding="utf-8") as fp:
    contents = fp.readlines()

# never ask for more names than the seed file has lines
MAX_USERS = min(MAX_USERS, len(contents))
MAX_PKGS = min(MAX_PKGS, len(contents))

# Package names are only kept distinct from user names when more than
# MAX_PKGS lines remain once MAX_USERS of them are set aside for users;
# otherwise a package may share its name with a user.
need_dupes = len(contents) - MAX_USERS <= MAX_PKGS

# select random user and package names
#
def _normalize_word(word):
    """Turn a seed file line into a lowercase name without quotes, dots or spaces."""
    return word.replace("'", "").replace(".", "").replace(" ", "_").strip().lower()


def _pick_names(candidates, count, first_id):
    """Map `count` randomly chosen `candidates` to consecutive IDs from `first_id`."""
    chosen = random.sample(candidates, count)
    return {name: first_id + offset for offset, name in enumerate(chosen)}


# Distinct, non-empty names only.  Drawing from this list (instead of
# retrying random lines until enough unique names turn up) cannot loop
# forever when the seed file holds fewer distinct names than requested.
# Sorting keeps the draw reproducible for a seeded random generator.
words = sorted({_normalize_word(line) for line in contents} - {""})

log.debug("Generating random user names...")
MAX_USERS = min(MAX_USERS, len(words))
seen_users.update(_pick_names(words, MAX_USERS, USER_ID))
user_keys = list(seen_users)

log.debug("Generating random package names...")
if not need_dupes:
    # enough words available: keep package names distinct from user names
    words = [word for word in words if word not in seen_users]
MAX_PKGS = min(MAX_PKGS, len(words))
seen_pkgs.update(_pick_names(words, MAX_PKGS, PKG_ID))

# free up contents memory
#
contents = None

# developer/tu IDs
#
developers = []    # Users.IDs of the accounts created as Developers
trustedusers = []  # Users.IDs of the accounts created as Trusted Users
has_devs = False   # set once MAX_DEVS * MAX_USERS developers exist
has_tus = False    # set once MAX_TUS * MAX_USERS trusted users exist

# Just let python throw the errors if any happen
#
# The file stays open for the rest of the script; all statements are
# wrapped in a single transaction that is committed at the very end.
out = open(sys.argv[1], "w", encoding="utf-8")
out.write("BEGIN;\n")

# Begin by creating the User statements
#
# Account types written below: 1 = normal user, 2 = trusted user,
# 3 = developer.  Roles are handed out by a random roll until the quota
# of each role is filled.
log.debug("Creating SQL statements for users.")
dev_quota = MAX_DEVS * MAX_USERS
tu_quota = MAX_TUS * MAX_USERS
for u in user_keys:
    account_type = 1  # default to normal user
    if not has_devs or not has_tus:
        roll = random.randrange(1, 4)
        if roll == 3 and not has_devs:
            # this will be a dev account
            #
            account_type = 3
            developers.append(seen_users[u])
            if len(developers) >= dev_quota:
                has_devs = True
        elif roll == 2 and not has_tus:
            # this will be a trusted user account
            #
            account_type = 2
            trustedusers.append(seen_users[u])
            if len(trustedusers) >= tu_quota:
                has_tus = True
        # Any other roll -- including one for a role whose quota is
        # already full -- leaves this a normal user account.  The roll
        # must not leak into account_type, or untracked extra
        # developers/trusted users would be written.

    # the password hash is the MD5 of the user name, i.e. every dummy
    # account's password equals its user name
    passwd = hashlib.md5(u.encode()).hexdigest()
    s = ("INSERT INTO Users (ID, AccountTypeID, Username, Email, Passwd)"
         " VALUES (%d, %d, '%s', '%s@example.com', '%s');\n")
    out.write(s % (seen_users[u], account_type, u, u, passwd))

# Summary of what has been generated so far.
log.debug("Number of developers: %d", len(developers))
log.debug("Number of trusted users: %d", len(trustedusers))
log.debug("Number of users: %d",
          MAX_USERS - len(developers) - len(trustedusers))
log.debug("Number of packages: %d", MAX_PKGS)

# The fortune file is cut into entries at every "%\n" separator; the
# entries are used as comment and proposal texts.
log.debug("Gathering text from fortune file...")
with open(FORTUNE_FILE, "r", encoding="utf-8") as fp:
    fortunes = fp.read().split("%\n")

# Create the package statements
#
log.debug("Creating SQL statements for packages.")
for count, (p, pkg_id) in enumerate(seen_pkgs.items()):
    NOW = int(time.time())

    # maintainer and packager alternate between developers and trusted users
    staff = developers if count % 2 == 0 else trustedusers
    muid = random.choice(staff)
    puid = random.choice(staff)
    if count % 20 == 0:  # every so often, there are orphans...
        muid = "NULL"

    uuid = genUID()  # the submitter/user

    # muid/puid are formatted with %s because muid may be the literal NULL
    s = ("INSERT INTO PackageBases (ID, Name, FlaggerComment, SubmittedTS, ModifiedTS, "
         "SubmitterUID, MaintainerUID, PackagerUID) VALUES (%d, '%s', '', %d, %d, %d, %s, %s);\n")
    out.write(s % (pkg_id, p, NOW, NOW, uuid, muid, puid))

    # one package per package base, sharing its ID and name
    s = ("INSERT INTO Packages (ID, PackageBaseID, Name, Version) VALUES "
         "(%d, %d, '%s', '%s');\n")
    out.write(s % (pkg_id, pkg_id, p, genVersion()))

    # create random comments for this package
    #
    # randint() includes the configured maximum and also accepts
    # min == max, which randrange(min, max) rejects as an empty range.
    for _ in range(random.randint(*PKG_CMNTS)):
        # comments are dated between 400 seconds and three days after NOW
        now = NOW + random.randrange(400, 86400*3)
        s = ("INSERT INTO PackageComments (PackageBaseID, UsersID,"
             " Comments, RenderedComment, CommentTS) VALUES (%d, %d, '%s', '', %d);\n")
        out.write(s % (pkg_id, genUID(), genFortune(), now))

# Cast votes
#
track_votes = {}  # package base ID -> number of votes received
log.debug("Casting votes for packages.")

# Vote for the IDs that were really handed out instead of assuming that
# package IDs start at 1 (they start at PKG_ID).
pkg_ids = list(seen_pkgs.values())
min_votes = int(len(pkg_ids) * VOTING[0])
max_votes = int(len(pkg_ids) * VOTING[1])
for u in user_keys:
    # randrange() rejects an empty range, which happens when there are
    # only a handful of packages
    if max_votes > min_votes:
        num_votes = random.randrange(min_votes, max_votes)
    else:
        num_votes = min_votes

    voted = set()  # a user votes for any package at most once
    for _ in range(num_votes):
        pkg = random.choice(pkg_ids)
        if pkg in voted:
            continue
        voted.add(pkg)
        track_votes[pkg] = track_votes.get(pkg, 0) + 1
        s = ("INSERT INTO PackageVotes (UsersID, PackageBaseID)"
             " VALUES (%d, %d);\n")
        out.write(s % (seen_users[u], pkg))

# Update statements for package votes
#
for pkg, votes in track_votes.items():
    s = "UPDATE PackageBases SET NumVotes = %d WHERE ID = %d;\n"
    out.write(s % (votes, pkg))

# Create package dependencies and sources
#
log.debug("Creating statements for package depends/sources.")

# random.choice() needs a sequence; build the name list once instead of
# once per generated row
pkg_names = list(seen_pkgs)

# The counts below use randint() so that the configured maximum is
# included and min == max is accepted (randrange(min, max) would raise).
for p, pkg_id in seen_pkgs.items():
    for _ in range(random.randint(*PKG_DEPS)):
        dep = random.choice(pkg_names)
        deptype = random.randrange(1, 5)
        if deptype == 4:
            # dependency type 4 carries a "name: for other" description
            dep += ": for " + random.choice(pkg_names)
        s = "INSERT INTO PackageDepends(PackageID, DepTypeID, DepName) VALUES (%d, %d, '%s');\n"
        out.write(s % (pkg_id, deptype, dep))

    # the number of relations follows PKG_RELS, not the number of depends
    for _ in range(random.randint(*PKG_RELS)):
        rel = random.choice(pkg_names)
        reltype = random.randrange(1, 4)
        s = "INSERT INTO PackageRelations(PackageID, RelTypeID, RelName) VALUES (%d, %d, '%s');\n"
        out.write(s % (pkg_id, reltype, rel))

    for _ in range(random.randint(*PKG_SRC)):
        # sources look like <scheme><pkg>.<tld>/<dir>/<user name>-<version>.tar.gz
        src_file = random.choice(user_keys)
        src = "%s%s.%s/%s/%s-%s.tar.gz" % (
            random.choice(RANDOM_URL),
            p, random.choice(RANDOM_TLDS),
            random.choice(RANDOM_LOCS),
            src_file, genVersion())
        s = "INSERT INTO PackageSources(PackageID, Source) VALUES (%d, '%s');\n"
        out.write(s % (pkg_id, src))

# Create trusted user proposals
#
# The first CLOSE_PROPOSALS proposals are already over, the remaining
# OPEN_PROPOSALS ones are still running.
log.debug("Creating SQL statements for trusted user proposals.")
for count in range(OPEN_PROPOSALS + CLOSE_PROPOSALS):
    now = int(time.time())
    if count < CLOSE_PROPOSALS:
        # closed: submitted one to three weeks ago, ended within the last week
        start = now - random.randrange(3600*24*7, 3600*24*21)
        end = now - random.randrange(0, 3600*24*7)
    else:
        # open: submitted now, ends in one to seven days
        start = now
        end = now + random.randrange(3600*24, 3600*24*7)

    # Don't make the vote about anyone once in a while
    user = "" if count % 5 == 0 else random.choice(user_keys)
    suid = random.choice(trustedusers)
    s = ("INSERT INTO TU_VoteInfo (Agenda, User, Submitted, End,"
         " Quorum, SubmitterID) VALUES ('%s', '%s', %d, %d, 0.0, %d);\n")
    out.write(s % (genFortune(), user, start, end, suid))

# close output file
#
# COMMIT ends the transaction opened by the BEGIN at the top of the output.
out.write("COMMIT;\n\n")
out.close()
log.debug("Done.")