#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
reporead.py
Parses a repo.db.tar.gz file and updates the Arch database with the relevant
changes.
Usage: reporead.py ARCH PATH
ARCH: architecture to update, and can be one of: i686, x86_64
PATH: full path to the repo.db.tar.gz file.
Example:
reporead.py i686 /tmp/core.db.tar.gz
"""
###
### User Variables
###
# multi-value blocks
REPOVARS = ['arch', 'backup', 'builddate', 'conflicts', 'csize',
            'deltas', 'depends', 'desc', 'filename', 'files', 'force',
            'groups', 'installdate', 'isize', 'license', 'md5sum',
            'name', 'optdepends', 'packager', 'provides', 'reason',
            'replaces', 'size', 'url', 'version']
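
# For reference, a rough sketch of the on-disk format these names come from:
# each 'desc'/'depends' file in the db is a series of %BLOCK% headers, each
# followed by one value per line and ended by a blank line (values below are
# illustrative, not from a real db):
#
#   %NAME%
#   pacman
#
#   %VERSION%
#   3.2.0-1
#
#   %DEPENDS%
#   bash
#   libarchive>=2.5.0
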
###
### Imports
###
import os
import re
import sys
import gzip
import tarfile
import logging
from datetime import datetime
from django.core.management import setup_environ
# mung the sys path to get to django root dir, no matter
# where we are called from
archweb_app_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
os.chdir(archweb_app_path)
sys.path[0] = archweb_app_path
import settings
setup_environ(settings)
from pprint import pprint as pp
from cStringIO import StringIO
from logging import CRITICAL,ERROR,WARNING,INFO,DEBUG
from main.models import Arch, Package, PackageFile, PackageDepend, Repo

class SomethingFishyException(Exception):
    '''Raised when the database looks like it's going to wipe out a bunch of
    packages.'''
    pass

###
### Initialization
###
logging.basicConfig(
    level=WARNING,
    format='%(asctime)s -> %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    stream=sys.stderr)
logger = logging.getLogger()
###
### function and class definitions
###
class Pkg(object):
    """An interim 'container' object for holding Arch package data."""
    def __init__(self, val):
        selfdict = {}
        # these variables are single-valued; squash their lists to strings
        squash = ['arch', 'builddate', 'csize', 'desc', 'filename',
                  'installdate', 'isize', 'license', 'md5sum',
                  'packager', 'size', 'url']
        selfdict['name'] = val['name'][0]
        del val['name']
        if 'desc' not in val:
            logger.warning("Package %s has no description" % selfdict['name'])
            val['desc'] = ''
        if 'url' not in val:
            val['url'] = ''
        for x in val.keys():
            if x in squash:
                if len(val[x]) == 0:
                    logger.warning("Package %s has no %s" % (selfdict['name'], x))
                selfdict[x] = ''.join(val[x])
                # make sure we don't have elements larger than the db char
                # fields (255 chars)
                if len(selfdict[x]) > 255:
                    selfdict[x] = selfdict[x][:255]
            elif x == 'force':
                selfdict[x] = True
            elif x == 'version':
                # split 'pkgver-pkgrel' on the last hyphen only
                version = val[x][0].rsplit('-', 1)
                selfdict['ver'] = version[0]
                selfdict['rel'] = version[1]
            elif x == 'reason':
                selfdict[x] = int(val[x][0])
            else:
                selfdict[x] = val[x]
        self.__dict__ = selfdict

    def __getattr__(self, name):
        # unset attributes default to None; 'force' defaults to False
        if name == 'force':
            return False
        else:
            return None
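
# A minimal sketch of how Pkg consumes parse_inf() output (toy values, not
# from a real db):
#   data = {'name': ['pacman'], 'version': ['3.2.0-1'],
#           'arch': ['i686'], 'desc': ['A package manager']}
#   p = Pkg(data)
#   # p.name == 'pacman', p.ver == '3.2.0', p.rel == '1'
#   # p.force == False and unknown attributes return None (__getattr__)
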
def usage():
    """Print the usage of this application."""
    print __doc__.strip()

def fetchiter_dict(cursor):
    """
    Given an executed DB API 2.0 cursor object, yields each result row as a
    dictionary mapping column names to values.
    """
    rows = cursor.fetchmany(size=30)
    while rows:
        for row in rows:
            yield dictize(cursor, row)
        rows = cursor.fetchmany(size=30)

def fetchone_dict(cursor):
    """
    Given an executed DB API 2.0 cursor object, returns the next result row
    as a dictionary mapping column names to values.
    """
    row = cursor.fetchone()
    if row is None:
        return None
    return dictize(cursor, row)

def dictize(cursor, row):
    """Map one result row to a dict keyed by the cursor's column names."""
    result = {}
    for column, desc in enumerate(cursor.description):
        result[desc[0]] = row[column]
    return result
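
# A minimal usage sketch for the three cursor helpers above, assuming any
# DB API 2.0 connection (the query, table, and column names are made up):
#   cur = conn.cursor()
#   cur.execute("SELECT id, pkgname FROM packages")
#   for rowdict in fetchiter_dict(cur):
#       print rowdict['pkgname']
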
def db_update(archname, pkgs):
    """
    Parses a list and updates the Arch dev database accordingly.

    Arguments:
     pkgs -- A list of Pkg objects.
    """
    logger.info('Updating Arch: %s' % archname)
    repository = Repo.objects.get(name__iexact=pkgs[0].repo)
    architecture = Arch.objects.get(name__iexact=archname)
    dbpkgs = Package.objects.filter(arch=architecture, repo=repository)
    now = datetime.now()
    # go go set theory!
    # thank you python for having a set class <3
    logger.debug("Creating sets")
    dbset = set([pkg.pkgname for pkg in dbpkgs])
    syncset = set([pkg.name for pkg in pkgs])
    logger.info("%d packages in current web DB" % len(dbset))
    logger.info("%d packages in new updating db" % len(syncset))
    # packages in syncdb and not in database (add to database)
    logger.debug("Set theory: Packages in syncdb not in database")
    in_sync_not_db = syncset - dbset
    logger.info("%d packages in sync not db" % len(in_sync_not_db))
    # Try to catch those random orphaning issues that make Eric so unhappy.
    if len(syncset) < len(dbset) * .5:
        logger.error(".db.tar.gz has less than 50% of the packages in the web database")
        raise SomethingFishyException(
            'it looks like the web database has more than twice as many '
            'packages as the new syncdb. WTF?')
    if len(syncset) < len(dbset) * .75:
        logger.warning(".db.tar.gz has less than 75% of the packages in the web database.")
    for p in [x for x in pkgs if x.name in in_sync_not_db]:
        logger.debug("Adding package %s", p.name)
        ## note: maintainer is being set to orphan for now
        ## maybe later we can add logic to match pkgbuild maintainers
        ## to db maintainer ids
        pkg = Package(
            repo=repository, arch=architecture, maintainer_id=0,
            needupdate=False, url=p.url, last_update=now,
            pkgname=p.name, pkgver=p.ver, pkgrel=p.rel,
            pkgdesc=p.desc)
        pkg.save()
        # files are not in the repo.db.tar.gz
        #for x in p.files:
        #    pkg.packagefile_set.create(path=x)
        if 'depends' in p.__dict__:
            for y in p.depends:
                # make sure we aren't adding self depends..
                # yes *sigh* i have seen them in pkgbuilds
                dpname, dpvcmp = re.match(r"([a-z0-9._-]+)(.*)", y).groups()
                if dpname == p.name:
                    logger.warning('Package %s has a depend on itself' % p.name)
                    continue
                pkg.packagedepend_set.create(depname=dpname, depvcmp=dpvcmp)
                logger.debug('Added %s as dep for pkg %s' % (dpname, p.name))
    # packages in database and not in syncdb (remove from database)
    logger.debug("Set theory: Packages in database not in syncdb")
    in_db_not_sync = dbset - syncset
    for p in in_db_not_sync:
        logger.info("Removing package %s from database", p)
        Package.objects.get(
            pkgname=p, arch=architecture, repo=repository).delete()
    # packages in both database and in syncdb (update in database)
    logger.debug("Set theory: Packages in database and syncdb")
    pkg_in_both = syncset & dbset
    for p in [x for x in pkgs if x.name in pkg_in_both]:
        logger.debug("Looking for package updates")
        dbp = dbpkgs.get(pkgname=p.name)
        if ''.join((p.ver, p.rel)) == ''.join((dbp.pkgver, dbp.pkgrel)):
            continue
        logger.info("Updating package %s in database", p.name)
        pkg = Package.objects.get(
            pkgname=p.name, arch=architecture, repo=repository)
        pkg.pkgver = p.ver
        pkg.pkgrel = p.rel
        pkg.pkgdesc = p.desc
        pkg.url = p.url
        pkg.needupdate = False
        pkg.last_update = now
        pkg.save()
        # files are not in the repo.db.tar.gz
        #pkg.packagefile_set.all().delete()
        #for x in p.files:
        #    pkg.packagefile_set.create(path=x)
        pkg.packagedepend_set.all().delete()
        if 'depends' in p.__dict__:
            for y in p.depends:
                # use the same name pattern as above so names containing
                # dots or underscores split correctly
                dpname, dpvcmp = re.match(r"([a-z0-9._-]+)(.*)", y).groups()
                pkg.packagedepend_set.create(depname=dpname, depvcmp=dpvcmp)
    logger.info('Finished updating Arch: %s' % archname)
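
# To illustrate db_update()'s set arithmetic on toy data:
#   dbset   = set(['a', 'b'])   # packages already in the web DB
#   syncset = set(['b', 'c'])   # packages in the new .db.tar.gz
#   syncset - dbset == set(['c'])   -> 'c' is added
#   dbset - syncset == set(['a'])   -> 'a' is removed
#   syncset & dbset == set(['b'])   -> 'b' is checked for a version bump
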
def parse_inf(iofile):
    """
    Parses an Arch repo db information file, and returns the variables it
    contains as a dict of lists.

    Arguments:
     iofile -- A StringIO, FileType, or other object with a readlines method.
    """
    store = {}
    lines = iofile.readlines()
    blockname = None
    numlines = len(lines)
    i = 0
    while i < numlines:
        line = lines[i].strip()
        if len(line) > 0 and line[0] == '%' and line[1:-1].lower() in REPOVARS:
            blockname = line[1:-1].lower()
            logger.debug("Parsing package block %s", blockname)
            store[blockname] = []
            i += 1
            while i < numlines and len(lines[i].strip()) > 0:
                store[blockname].append(lines[i].strip())
                i += 1
            # here is where i would convert arrays to strings
            # based on count and type, but i don't think it is needed now
        i += 1
    return store
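
# Given the sample %NAME%/%VERSION%/%DEPENDS% blocks sketched near REPOVARS,
# parse_inf() would return (hypothetical values):
#   {'name': ['pacman'], 'version': ['3.2.0-1'],
#    'depends': ['bash', 'libarchive>=2.5.0']}
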
def parse_repo(repopath):
    """
    Parses an Arch repo db file, and returns a list of Pkg objects.

    Arguments:
     repopath -- The path of a repository db file.
    """
    logger.info("Starting repo parsing")
    if not os.path.exists(repopath):
        logger.error("Could not read file %s", repopath)
        # bail out early instead of letting tarfile.open fail below
        return []
    logger.info("Reading repo tarfile")
    filename = os.path.split(repopath)[1]
    rindex = filename.rindex('.db.tar.gz')
    reponame = filename[:rindex]
    repodb = tarfile.open(repopath, "r:gz")
    ## assuming a well-formed tar, with dir first then files after;
    ## repo-add enforces this
    logger.debug("Starting package parsing")
    pkgs = []
    tpkg = None
    while True:
        tarinfo = repodb.next()
        if tarinfo is None or tarinfo.isdir():
            if tpkg is not None:
                # a complete package entry has been buffered; parse it
                tpkg.reset()
                data = parse_inf(tpkg)
                p = Pkg(data)
                p.repo = reponame
                logger.debug("Done parsing package %s", p.name)
                pkgs.append(p)
            if tarinfo is None:
                break
            # set new tpkg
            tpkg = StringIO()
        if tarinfo.isreg():
            if os.path.split(tarinfo.name)[1] in ('desc', 'depends'):
                tpkg.write(repodb.extractfile(tarinfo).read())
                tpkg.write('\n')  # just in case
    repodb.close()
    logger.info("Finished repo parsing")
    return pkgs
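
# For reference, the rough layout repo-add produces inside e.g.
# core.db.tar.gz, which the dir-then-files assumption above relies on
# (package names illustrative):
#   pacman-3.2.0-1/
#   pacman-3.2.0-1/desc
#   pacman-3.2.0-1/depends
#   ...
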
def main(argv=None):
    """
    Parses a repo.db.tar.gz file and returns the exit status.

    Keyword Arguments:
     argv -- A list/array simulating a sys.argv (default None)
             If left empty, sys.argv is used
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 3:
        usage()
        return 0
    # check if arch is valid
    available_arches = [x.name for x in Arch.objects.all()]
    if argv[1] not in available_arches:
        usage()
        return 0
    else:
        primary_arch = argv[1]
    repo_file = os.path.normpath(argv[2])
    packages = parse_repo(repo_file)
    # sort packages by arch -- to handle noarch stuff
    packages_arches = {}
    for arch in available_arches:
        packages_arches[arch] = []
    for package in packages:
        if package.arch in ('any', primary_arch):
            packages_arches[package.arch].append(package)
        else:
            logger.warning("Package %s arch = %s" % (
                package.name, package.arch))
            #package.arch = primary_arch
    logger.info('Starting database updates.')
    for (arch, pkgs) in packages_arches.iteritems():
        if len(pkgs) > 0:
            db_update(arch, pkgs)
    logger.info('Finished database updates.')
    return 0

###
### Main eval
###
if __name__ == '__main__':
    logger.setLevel(INFO)
    sys.exit(main())