reporead: revamp database parsing code

This code needed a little sprucing up, as it has grown quite organically
over the life of this script. Make things a bit more pythonic by using
iterators rather than collection indexing, and generalize the handling of
the special cases a bit.

Also catch encoding problems early and fail gracefully rather than blowing
up the entire package parser. A failed decode of a file should cause us to
skip just that file rather than stop the entire parser. In the worst case,
this leaves that package out of the web interface.

Signed-off-by: Dan McGee <dan@archlinux.org>
This commit is contained in:
Dan McGee 2010-09-12 12:24:23 -05:00
parent 51fca0c98d
commit 5ca2cea388

View File

@ -13,19 +13,12 @@
./manage.py reporead i686 /tmp/core.db.tar.gz
"""
# multi value blocks
REPOVARS = ['arch', 'backup', 'base', 'builddate', 'conflicts', 'csize',
'deltas', 'depends', 'desc', 'filename', 'files', 'force',
'groups', 'installdate', 'isize', 'license', 'md5sum',
'name', 'optdepends', 'packager', 'provides', 'reason',
'replaces', 'size', 'url', 'version']
from django.core.management.base import BaseCommand, CommandError
from django.contrib.auth.models import User
from django.db import transaction
from django.db.models import Q
import codecs
import os
import re
import sys
@ -34,7 +27,6 @@
from datetime import datetime
from optparse import make_option
from cStringIO import StringIO
from logging import ERROR, WARNING, INFO, DEBUG
from main.models import Arch, Package, Repo
@ -85,49 +77,36 @@ def handle(self, arch=None, filename=None, **options):
class Pkg(object):
"""An interim 'container' object for holding Arch package data."""
bare = ( 'name', 'base', 'arch', 'desc', 'filename',
'md5sum', 'url', 'builddate', 'packager' )
squash = ( 'license', )
number = ( 'csize', 'isize' )
def __init__(self, val, repo):
selfdict = {}
squash = ['arch', 'builddate', 'csize', 'desc', 'filename',
'installdate', 'isize', 'license', 'md5sum',
'packager', 'size', 'url']
selfdict['name'] = val['name'][0]
selfdict['base'] = None
del val['name']
if 'license' not in val:
val['license'] = []
for x in val.keys():
if x in squash:
if val[x] == None or len(val[x]) == 0:
logger.warning("Package %s has no %s" % (selfdict['name'], x))
selfdict[x] = None
else:
selfdict[x] = ', '.join(val[x])
# make sure we don't have elements larger than the db char
# fields
if len(selfdict[x]) > 255:
selfdict[x] = selfdict[x][:254]
elif x == 'base':
selfdict[x] = val[x][0]
elif x == 'force':
selfdict[x] = True
elif x == 'version':
version = val[x][0].rsplit('-')
selfdict['ver'] = version[0]
selfdict['rel'] = version[1]
elif x == 'reason':
selfdict[x] = int(val[x][0])
else:
selfdict[x] = val[x]
self.__dict__ = selfdict
def __init__(self, repo):
self.repo = repo
self.ver = None
self.rel = None
for k in self.bare + self.squash + self.number:
setattr(self, k, None)
def __getattr__(self, name):
if name == 'force':
return False
else:
return None
def populate(self, values):
for k, v in values.iteritems():
# ensure we stay under our DB character limit
if k in self.bare:
setattr(self, k, v[0][:254])
elif k in self.squash:
setattr(self, k, u', '.join(v)[:254])
elif k in self.number:
setattr(self, k, long(v[0]))
elif k == 'force':
setattr(self, k, True)
elif k == 'version':
ver, rel = v[0].rsplit('-')
setattr(self, 'ver', ver)
setattr(self, 'rel', rel)
else:
# files, depends, etc.
setattr(self, k, v)
def find_user(userstring):
@ -187,8 +166,8 @@ def populate_pkg(dbpkg, repopkg, force=False, timestamp=None):
dbpkg.license = repopkg.license
dbpkg.url = repopkg.url
dbpkg.filename = repopkg.filename
dbpkg.compressed_size = int(repopkg.csize)
dbpkg.installed_size = int(repopkg.isize)
dbpkg.compressed_size = repopkg.csize
dbpkg.installed_size = repopkg.isize
try:
dbpkg.build_date = datetime.utcfromtimestamp(int(repopkg.builddate))
except ValueError:
@ -335,33 +314,24 @@ def db_update(archname, reponame, pkgs, options):
logger.info('Finished updating Arch: %s' % archname)
def parse_inf(iofile):
def parse_info(iofile):
"""
Parses an Arch repo db information file, and returns a dict mapping each lower-cased block name to the list of values found in that block.
Arguments:
iofile -- A StringIO, FileType, or other object with readlines method.
"""
store = {}
lines = iofile.readlines()
blockname = None
max_len = len(lines)
i = 0
while i < max_len:
line = lines[i].strip()
if len(line) > 0 and line[0] == '%' and line[1:-1].lower() in REPOVARS:
for line in iofile:
line = line.strip()
if len(line) == 0:
continue
elif line.startswith('%') and line.endswith('%'):
blockname = line[1:-1].lower()
logger.debug("Parsing package block %s", blockname)
store[blockname] = []
i += 1
while i < max_len and len(lines[i].strip()) > 0:
store[blockname].append(lines[i].strip())
i += 1
# here is where i would convert arrays to strings
# based on count and type, but i dont think it is needed now
i += 1
elif blockname:
store[blockname].append(line)
else:
raise Exception("Read package info outside a block: %s" % line)
return store
@ -391,29 +361,30 @@ def parse_repo(repopath):
## repo-add enforces this
logger.debug("Starting package parsing")
dbfiles = ('desc', 'depends', 'files')
pkgs = []
tpkg = None
while True:
tarinfo = repodb.next()
if tarinfo == None or tarinfo.isdir():
if tpkg != None:
tpkg.reset()
data = parse_inf(tpkg)
p = Pkg(data, reponame)
logger.debug("Done parsing package %s", p.name)
pkgs.append(p)
if tarinfo == None:
break
# set new tpkg
tpkg = StringIO()
if tarinfo.isreg():
fname = os.path.split(tarinfo.name)[1]
if fname in dbfiles:
tpkg.write(repodb.extractfile(tarinfo).read())
tpkg.write('\n') # just in case
pkgs = {}
for tarinfo in repodb.getmembers():
if tarinfo.isdir():
continue
elif tarinfo.isreg():
pkgid, fname = os.path.split(tarinfo.name)
if fname not in dbfiles:
continue
data_file = repodb.extractfile(tarinfo)
data_file = codecs.EncodedFile(data_file, 'utf-8')
try:
data = parse_info(data_file)
p = pkgs.setdefault(pkgid, Pkg(reponame))
p.populate(data)
except UnicodeDecodeError, e:
logger.warn("Could not correctly decode %s, skipping file" % \
tarinfo.name)
data_file.close()
logger.debug("Done parsing file %s", fname)
repodb.close()
logger.info("Finished repo parsing")
return (reponame, pkgs)
logger.info("Finished repo parsing, %d total packages" % len(pkgs))
return (reponame, pkgs.values())
def validate_arch(arch):
"Check if arch is valid."