gendummydata.py 9.81 KB
Newer Older
1
#!/usr/bin/python3
eric's avatar
eric committed
2
3
4
"""
usage: gendummydata.py outputfilename.sql
"""
eric's avatar
eric committed
5
6
7
8
9
10
11
#
# This script seeds the AUR database with dummy data for
# use during development/testing.  It uses random entries
# from /usr/share/dict/words to create user accounts and
# package names.  It generates the SQL statements to
# insert these users/packages into the AUR database.
#
12
import hashlib
13
14
15
16
import random
import time
import os
import sys
17
import io
18
import logging
19

20
LOG_LEVEL = logging.DEBUG # logging level. set to logging.INFO to reduce output
eric's avatar
eric committed
21
SEED_FILE = "/usr/share/dict/words"
22
23
24
25
DB_HOST   = os.getenv("DB_HOST", "localhost")
DB_NAME   = os.getenv("DB_NAME", "AUR")
DB_USER   = os.getenv("DB_USER", "aur")
DB_PASS   = os.getenv("DB_PASS", "aur")
eric's avatar
eric committed
26
USER_ID   = 5          # Users.ID of first bogus user
eric's avatar
eric committed
27
PKG_ID    = 1          # Packages.ID of first package
Loui Chang's avatar
Loui Chang committed
28
MAX_USERS = 300        # how many users to 'register'
eric's avatar
eric committed
29
30
MAX_DEVS  = .1         # what percentage of MAX_USERS are Developers
MAX_TUS   = .2         # what percentage of MAX_USERS are Trusted Users
Loui Chang's avatar
Loui Chang committed
31
MAX_PKGS  = 900       # how many packages to load
32
PKG_DEPS  = (1, 15)    # min/max depends a package has
33
PKG_RELS  = (1, 5)     # min/max relations a package has
34
PKG_SRC   = (1, 3)     # min/max sources a package has
eric's avatar
eric committed
35
PKG_CMNTS = (1, 5)     # min/max number of comments a package has
36
CATEGORIES_COUNT = 17  # the number of categories from aur-schema
eric's avatar
eric committed
37
VOTING    = (0, .30)   # percentage range for package voting
38
39
OPEN_PROPOSALS = 5 # number of open trusted user proposals
CLOSE_PROPOSALS = 15 # number of closed trusted user proposals
eric's avatar
eric committed
40
41
42
RANDOM_TLDS = ("edu", "com", "org", "net", "tw", "ru", "pl", "de", "es")
RANDOM_URL = ("http://www.", "ftp://ftp.", "http://", "ftp://")
RANDOM_LOCS = ("pub", "release", "files", "downloads", "src")
43
FORTUNE_FILE = "/usr/share/fortune/cookie"
44

45
46
47
48
# setup logging
logformat = "%(levelname)s: %(message)s"
logging.basicConfig(format=logformat, level=LOG_LEVEL)
log = logging.getLogger()
eric's avatar
eric committed
49
50

if len(sys.argv) != 2:
51
	log.error("Missing output filename argument")
eric's avatar
eric committed
52
53
54
55
56
	raise SystemExit

# make sure the seed file exists
#
if not os.path.exists(SEED_FILE):
57
	log.error("Please install the 'words' Arch package")
58
	raise SystemExit
eric's avatar
eric committed
59

60
61
# make sure comments can be created
#
62
if not os.path.exists(FORTUNE_FILE):
63
64
65
	log.error("Please install the 'fortune-mod' Arch package")
	raise SystemExit

eric's avatar
eric committed
66
67
68
69
70
71
72
73
# track what users/package names have been used
#
seen_users = {}
seen_pkgs = {}
user_keys = []

# some functions to generate random data
#
74
def genVersion():
eric's avatar
eric committed
75
76
77
	ver = []
	ver.append("%d" % random.randrange(0,10))
	ver.append("%d" % random.randrange(0,20))
eric's avatar
eric committed
78
	if random.randrange(0,2) == 0:
eric's avatar
eric committed
79
		ver.append("%d" % random.randrange(0,100))
80
	return ".".join(ver) + "-%d" % random.randrange(1,11)
eric's avatar
eric committed
81
def genCategory():
82
	return random.randrange(1,CATEGORIES_COUNT)
eric's avatar
eric committed
83
84
def genUID():
	return seen_users[user_keys[random.randrange(0,len(user_keys))]]
85
86
def genFortune():
	return fortunes[random.randrange(0,len(fortunes))].replace("'", "")
eric's avatar
eric committed
87
88
89
90


# load the words, and make sure there are enough words for users/pkgs
#
91
log.debug("Grabbing words from seed file...")
92
fp = open(SEED_FILE, "r", encoding="utf-8")
eric's avatar
eric committed
93
94
95
96
97
98
99
100
101
102
103
104
105
contents = fp.readlines()
fp.close()
if MAX_USERS > len(contents):
	MAX_USERS = len(contents)
if MAX_PKGS > len(contents):
	MAX_PKGS = len(contents)
if len(contents) - MAX_USERS > MAX_PKGS:
	need_dupes = 0
else:
	need_dupes = 1

# select random usernames
#
106
log.debug("Generating random user names...")
eric's avatar
eric committed
107
108
109
user_id = USER_ID
while len(seen_users) < MAX_USERS:
	user = random.randrange(0, len(contents))
eric's avatar
eric committed
110
111
	word = contents[user].replace("'", "").replace(".","").replace(" ", "_")
	word = word.strip().lower()
112
	if word not in seen_users:
eric's avatar
eric committed
113
114
		seen_users[word] = user_id
		user_id += 1
115
user_keys = list(seen_users.keys())
eric's avatar
eric committed
116
117
118

# select random package names
#
119
log.debug("Generating random package names...")
eric's avatar
eric committed
120
121
122
num_pkgs = PKG_ID
while len(seen_pkgs) < MAX_PKGS:
	pkg = random.randrange(0, len(contents))
eric's avatar
eric committed
123
124
	word = contents[pkg].replace("'", "").replace(".","").replace(" ", "_")
	word = word.strip().lower()
eric's avatar
eric committed
125
	if not need_dupes:
126
		if word not in seen_pkgs and word not in seen_users:
eric's avatar
eric committed
127
128
129
			seen_pkgs[word] = num_pkgs
			num_pkgs += 1
	else:
130
		if word not in seen_pkgs:
eric's avatar
eric committed
131
132
133
			seen_pkgs[word] = num_pkgs
			num_pkgs += 1

eric's avatar
eric committed
134
135
136
137
# free up contents memory
#
contents = None

138
139
140
141
142
143
144
# developer/tu IDs
#
developers = []
trustedusers = []
has_devs = 0
has_tus = 0

145
146
# Just let python throw the errors if any happen
#
147
out = open(sys.argv[1], "w", encoding="utf-8")
148
149
out.write("BEGIN;\n")

eric's avatar
eric committed
150
151
# Begin by creating the User statements
#
152
log.debug("Creating SQL statements for users.")
eric's avatar
eric committed
153
for u in user_keys:
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
	account_type = 1  # default to normal user
	if not has_devs or not has_tus:
		account_type = random.randrange(1, 4)
		if account_type == 3 and not has_devs:
			# this will be a dev account
			#
			developers.append(seen_users[u])
			if len(developers) >= MAX_DEVS * MAX_USERS:
				has_devs = 1
		elif account_type == 2 and not has_tus:
			# this will be a trusted user account
			#
			trustedusers.append(seen_users[u])
			if len(trustedusers) >= MAX_TUS * MAX_USERS:
				has_tus = 1
		else:
			# a normal user account
			#
			pass
173

174
175
	h = hashlib.new('md5')
	h.update(u.encode());
176
	s = ("INSERT INTO Users (ID, AccountTypeID, Username, Email, Passwd)"
177
178
		 " VALUES (%d, %d, '%s', '%s@example.com', '%s');\n")
	s = s % (seen_users[u], account_type, u, u, h.hexdigest())
eric's avatar
eric committed
179
	out.write(s)
180
181
182
183
184

log.debug("Number of developers: %d" % len(developers))
log.debug("Number of trusted users: %d" % len(trustedusers))
log.debug("Number of users: %d" % (MAX_USERS-len(developers)-len(trustedusers)))
log.debug("Number of packages: %d" % MAX_PKGS)
eric's avatar
eric committed
185

186
log.debug("Gathering text from fortune file...")
187
fp = open(FORTUNE_FILE, "r", encoding="utf-8")
188
189
190
fortunes = fp.read().split("%\n")
fp.close()

eric's avatar
eric committed
191
192
# Create the package statements
#
193
log.debug("Creating SQL statements for packages.")
eric's avatar
eric committed
194
count = 0
195
for p in list(seen_pkgs.keys()):
eric's avatar
eric committed
196
	NOW = int(time.time())
197
198
	if count % 2 == 0:
		muid = developers[random.randrange(0,len(developers))]
199
		puid = developers[random.randrange(0,len(developers))]
200
201
	else:
		muid = trustedusers[random.randrange(0,len(trustedusers))]
202
		puid = trustedusers[random.randrange(0,len(trustedusers))]
203
	if count % 20 == 0: # every so often, there are orphans...
204
		muid = "NULL"
205

eric's avatar
eric committed
206
207
	uuid = genUID() # the submitter/user

208
209
	s = ("INSERT INTO PackageBases (ID, Name, FlaggerComment, SubmittedTS, ModifiedTS, "
         "SubmitterUID, MaintainerUID, PackagerUID) VALUES (%d, '%s', '', %d, %d, %d, %s, %s);\n")
210
	s = s % (seen_pkgs[p], p, NOW, NOW, uuid, muid, puid)
211
	out.write(s)
212

213
214
215
	s = ("INSERT INTO Packages (ID, PackageBaseID, Name, Version) VALUES "
         "(%d, %d, '%s', '%s');\n")
	s = s % (seen_pkgs[p], seen_pkgs[p], p, genVersion())
eric's avatar
eric committed
216
	out.write(s)
217

eric's avatar
eric committed
218
219
	count += 1

eric's avatar
eric committed
220
221
222
223
224
	# create random comments for this package
	#
	num_comments = random.randrange(PKG_CMNTS[0], PKG_CMNTS[1])
	for i in range(0, num_comments):
		now = NOW + random.randrange(400, 86400*3)
225
		s = ("INSERT INTO PackageComments (PackageBaseID, UsersID,"
226
			 " Comments, RenderedComment, CommentTS) VALUES (%d, %d, '%s', '', %d);\n")
227
		s = s % (seen_pkgs[p], genUID(), genFortune(), now)
eric's avatar
eric committed
228
229
		out.write(s)

eric's avatar
eric committed
230
231
# Cast votes
#
232
track_votes = {}
233
log.debug("Casting votes for packages.")
eric's avatar
eric committed
234
for u in user_keys:
235
236
	num_votes = random.randrange(int(len(seen_pkgs)*VOTING[0]),
			int(len(seen_pkgs)*VOTING[1]))
eric's avatar
eric committed
237
238
	pkgvote = {}
	for v in range(num_votes):
239
		pkg = random.randrange(1, len(seen_pkgs) + 1)
240
		if pkg not in pkgvote:
241
			s = ("INSERT INTO PackageVotes (UsersID, PackageBaseID)"
242
243
				 " VALUES (%d, %d);\n")
			s = s % (seen_users[u], pkg)
eric's avatar
eric committed
244
			pkgvote[pkg] = 1
245
			if pkg not in track_votes:
246
247
				track_votes[pkg] = 0
			track_votes[pkg] += 1
eric's avatar
eric committed
248
249
			out.write(s)

250
251
# Update statements for package votes
#
252
for p in list(track_votes.keys()):
253
	s = "UPDATE PackageBases SET NumVotes = %d WHERE ID = %d;\n"
254
	s = s % (track_votes[p], p)
255
256
	out.write(s)

257
258
# Create package dependencies and sources
#
259
log.debug("Creating statements for package depends/sources.")
260
for p in list(seen_pkgs.keys()):
261
	num_deps = random.randrange(PKG_DEPS[0], PKG_DEPS[1])
262
	for i in range(0, num_deps):
263
		dep = random.choice([k for k in seen_pkgs])
264
265
266
		deptype = random.randrange(1, 5)
		if deptype == 4:
			dep += ": for " + random.choice([k for k in seen_pkgs])
267
		s = "INSERT INTO PackageDepends(PackageID, DepTypeID, DepName) VALUES (%d, %d, '%s');\n"
268
269
270
271
272
273
274
		s = s % (seen_pkgs[p], deptype, dep)
		out.write(s)

	num_rels = random.randrange(PKG_RELS[0], PKG_RELS[1])
	for i in range(0, num_deps):
		rel = random.choice([k for k in seen_pkgs])
		reltype = random.randrange(1, 4)
275
		s = "INSERT INTO PackageRelations(PackageID, RelTypeID, RelName) VALUES (%d, %d, '%s');\n"
276
277
		s = s % (seen_pkgs[p], reltype, rel)
		out.write(s)
278
279
280
281
282
283
284
285
286

	num_sources = random.randrange(PKG_SRC[0], PKG_SRC[1])
	for i in range(num_sources):
		src_file = user_keys[random.randrange(0, len(user_keys))]
		src = "%s%s.%s/%s/%s-%s.tar.gz" % (
				RANDOM_URL[random.randrange(0,len(RANDOM_URL))],
				p, RANDOM_TLDS[random.randrange(0,len(RANDOM_TLDS))],
				RANDOM_LOCS[random.randrange(0,len(RANDOM_LOCS))],
				src_file, genVersion())
287
		s = "INSERT INTO PackageSources(PackageID, Source) VALUES (%d, '%s');\n"
288
		s = s % (seen_pkgs[p], src)
289
290
		out.write(s)

291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
# Create trusted user proposals
#
log.debug("Creating SQL statements for trusted user proposals.")
count=0
for t in range(0, OPEN_PROPOSALS+CLOSE_PROPOSALS):
	now = int(time.time())
	if count < CLOSE_PROPOSALS:
		start =  now - random.randrange(3600*24*7, 3600*24*21)
		end = now - random.randrange(0, 3600*24*7)
	else:
		start = now
		end = now + random.randrange(3600*24, 3600*24*7)
	if count % 5 == 0: # Don't make the vote about anyone once in a while
		user = ""
	else:
		user = user_keys[random.randrange(0,len(user_keys))]
	suid = trustedusers[random.randrange(0,len(trustedusers))]
	s = ("INSERT INTO TU_VoteInfo (Agenda, User, Submitted, End,"
309
	" Quorum, SubmitterID) VALUES ('%s', '%s', %d, %d, 0.0, %d);\n")
310
	s = s % (genFortune(), user, start, end, suid)
311
312
313
	out.write(s)
	count += 1

eric's avatar
eric committed
314
315
# close output file
#
316
out.write("COMMIT;\n")
eric's avatar
eric committed
317
318
out.write("\n")
out.close()
319
log.debug("Done.")