reporead: implement delayed parsing of files data

This gives us some large memory savings in python due to the internal
storage of Unicode strings vs. byte strings, as well as saving us
processing time up front for filelist data we are never going to have to
actually use.

Signed-off-by: Dan McGee <dan@archlinux.org>
This commit is contained in:
Dan McGee 2013-12-17 08:26:47 -06:00
parent 8e8e3d7aa7
commit ac157895f1

View File

@ -82,8 +82,7 @@ class RepoPackage(object):
'md5sum', 'sha256sum', 'url', 'packager' )
number = ( 'csize', 'isize' )
collections = ( 'depends', 'optdepends', 'makedepends', 'checkdepends',
'conflicts', 'provides', 'replaces', 'groups', 'license',
'files' )
'conflicts', 'provides', 'replaces', 'groups', 'license')
def __init__(self, repo):
self.repo = repo
@ -98,7 +97,6 @@ def __init__(self, repo):
setattr(self, k, ())
self.builddate = None
self.files = None
self.has_files = False
def populate(self, values):
for k, v in values.iteritems():
@ -120,13 +118,21 @@ def populate(self, values):
logger.warning(
'Package %s had unparsable build date %s',
self.name, v[0])
elif k == 'files':
self.files = tuple(v)
self.has_files = True
else:
# anything left in collections
setattr(self, k, tuple(v))
@property
def files_list(self):
data_file = io.TextIOWrapper(io.BytesIO(self.files), encoding='UTF-8')
try:
info = parse_info(data_file)
except UnicodeDecodeError:
logger.warn("Could not correctly decode files list for %s",
self.name)
return None
return info['files']
@property
def full_version(self):
'''Very similar to the main.models.Package method.'''
@ -291,15 +297,18 @@ def populate_files(dbpkg, repopkg, force=False):
return
# only delete files if we are reading a DB that contains them
if repopkg.has_files:
if repopkg.files:
files = repopkg.files_list
# we had files data, but it couldn't be parsed, so skip
if not files:
return
delete_pkg_files(dbpkg)
logger.info("adding %d files for package %s",
len(repopkg.files), dbpkg.pkgname)
len(files), dbpkg.pkgname)
pkg_files = []
# sort in normal alpha-order that pacman uses, rather than makepkg's
# default breadth-first, directory-first ordering
files = sorted(repopkg.files)
for f in files:
for f in sorted(files):
if '/' in f:
dirname, filename = f.rsplit('/', 1)
dirname += '/'
@ -507,24 +516,27 @@ def parse_repo(repopath):
repodb = tarfile.open(repopath, "r")
logger.debug("Starting package parsing")
dbfiles = ('desc', 'depends', 'files')
newpkg = lambda: RepoPackage(reponame)
pkgs = defaultdict(newpkg)
for tarinfo in repodb.getmembers():
if tarinfo.isreg():
pkgid, fname = os.path.split(tarinfo.name)
if fname not in dbfiles:
continue
data_file = repodb.extractfile(tarinfo)
data_file = io.TextIOWrapper(io.BytesIO(data_file.read()),
encoding='UTF-8')
try:
pkgs[pkgid].populate(parse_info(data_file))
except UnicodeDecodeError:
logger.warn("Could not correctly decode %s, skipping file",
tarinfo.name)
data_file.close()
del data_file
if fname == 'files':
# don't parse yet for speed and memory consumption reasons
files_data = repodb.extractfile(tarinfo)
pkgs[pkgid].files = files_data.read()
del files_data
elif fname in ('desc', 'depends'):
data_file = repodb.extractfile(tarinfo)
data_file = io.TextIOWrapper(io.BytesIO(data_file.read()),
encoding='UTF-8')
try:
pkgs[pkgid].populate(parse_info(data_file))
except UnicodeDecodeError:
logger.warn("Could not correctly decode %s, skipping file",
tarinfo.name)
data_file.close()
del data_file
logger.debug("Done parsing file %s/%s", pkgid, fname)