reporead: bring back batched_bulk_create()

For packages whose file lists contain more than 80,000 items, we were
starting to see some serious memory issues in reporead, both on the
statement generation side in Python and on the database side. Break the
updates into chunks of 10,000 when we encounter packages with tons of
files to rein things in a bit.

Signed-off-by: Dan McGee <dan@archlinux.org>
Dan McGee 2013-12-17 08:24:41 -06:00
parent 77a45dc7bc
commit 8e8e3d7aa7

@@ -260,6 +260,24 @@ def delete_pkg_files(dbpkg):
     cursor.execute('DELETE FROM package_files WHERE pkg_id = %s', [dbpkg.id])
 
 
+def batched_bulk_create(model, all_objects):
+    cutoff = 10000
+    length = len(all_objects)
+    if length < cutoff:
+        return model.objects.bulk_create(all_objects)
+
+    def chunks():
+        offset = 0
+        while offset < length:
+            yield all_objects[offset:offset + cutoff]
+            offset += cutoff
+
+    for items in chunks():
+        ret = model.objects.bulk_create(items)
+    return ret
+
+
 def populate_files(dbpkg, repopkg, force=False):
     if not force:
         if not pkg_same_version(repopkg, dbpkg):
@@ -294,7 +312,7 @@ def populate_files(dbpkg, repopkg, force=False):
                 directory=dirname,
                 filename=filename)
             pkg_files.append(pkgfile)
-    PackageFile.objects.bulk_create(pkg_files)
+    batched_bulk_create(PackageFile, pkg_files)
     dbpkg.files_last_update = now()
     dbpkg.save()
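
For illustration, here is a standalone sketch of the same batching pattern,
run against a stub manager rather than a real Django model. StubManager and
StubModel are invented names for this example and are not part of archweb;
the sketch just shows a 25,000-item list being flushed in three bulk_create()
calls instead of one:

# Standalone sketch of the batching behaviour; StubManager and StubModel
# are stand-ins for a Django manager/model, invented for this example.

def batched_bulk_create(model, all_objects):
    cutoff = 10000
    length = len(all_objects)
    if length < cutoff:
        # small lists go through in a single bulk_create() call
        return model.objects.bulk_create(all_objects)

    def chunks():
        # slice the list into cutoff-sized pieces without copying it whole
        offset = 0
        while offset < length:
            yield all_objects[offset:offset + cutoff]
            offset += cutoff

    for items in chunks():
        ret = model.objects.bulk_create(items)
    return ret


class StubManager:
    """Records batch sizes instead of issuing INSERT statements."""
    def __init__(self):
        self.batch_sizes = []

    def bulk_create(self, objs):
        self.batch_sizes.append(len(objs))
        return objs


class StubModel:
    objects = StubManager()


if __name__ == '__main__':
    batched_bulk_create(StubModel, list(range(25000)))
    print(StubModel.objects.batch_sizes)   # -> [10000, 10000, 5000]

Note that in the batched path only the return value of the final
bulk_create() call is propagated; that is harmless here because
populate_files() discards the result.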