385 lines
12 KiB
Python
Executable File
385 lines
12 KiB
Python
Executable File
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
reporead.py

Parses a repo.db.tar.gz file and updates the Arch database with the relevant
changes.

Usage: reporead.py ARCH PATH
 ARCH: architecture to update, and can be one of: i686, x86_64
 PATH: full path to the repo.db.tar.gz file.

Example:
  reporead.py i686 /tmp/core.db.tar.gz

"""

###
### User Variables
###

# multi value blocks
# Recognized variable names in a repo db entry's 'desc'/'depends' files;
# parse_inf() only keeps %BLOCK% sections whose name appears here.
REPOVARS = ['arch', 'backup', 'builddate', 'conflicts', 'csize',
            'deltas', 'depends', 'desc', 'filename', 'files', 'force',
            'groups', 'installdate', 'isize', 'license', 'md5sum',
            'name', 'optdepends', 'packager', 'provides', 'reason',
            'replaces', 'size', 'url', 'version']

###
### Imports
###

import os
import re
import sys
import gzip
import tarfile
import logging
from datetime import datetime
from django.core.management import setup_environ
# mung the sys path to get to django root dir, no matter
# where we are called from
archweb_app_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
os.chdir(archweb_app_path)
sys.path[0] = archweb_app_path
# settings must be importable (and configured) before the model imports below
import settings
setup_environ(settings)
from pprint import pprint as pp
from cStringIO import StringIO
from logging import CRITICAL,ERROR,WARNING,INFO,DEBUG
from main.models import Arch, Package, PackageFile, PackageDepend, Repo
|
|
|
|
class SomethingFishyException(Exception):
    '''Raised when the database update looks unsafe -- i.e. the incoming
    sync db contains far fewer packages than the web database already
    holds, so proceeding would wipe out a bunch of packages (see the
    sanity checks in db_update).'''
    pass
|
|
|
|
###
### Initialization
###

# Log to stderr at WARNING by default so scheduled runs stay quiet;
# the __main__ block below raises the level to INFO when run as a script.
logging.basicConfig(
    level=WARNING,
    format='%(asctime)s -> %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    stream=sys.stderr)
# root logger, shared by every function in this module
logger = logging.getLogger()


###
### function and class definitions
###
|
|
|
|
class Pkg(object):
    """An interim 'container' object for holding Arch package data."""

    def __init__(self, val):
        """Build the package from `val`, a dict mapping repo-db field
        names to lists of string value lines (as produced by parse_inf).

        'name' is required; missing 'desc'/'url' are defaulted to ''.
        """
        selfdict = {}
        # single-valued fields that are collapsed to one plain string
        squash = ['arch', 'builddate', 'csize', 'desc', 'filename',
                  'installdate', 'isize', 'license', 'md5sum',
                  'packager', 'size', 'url']

        selfdict['name'] = val['name'][0]
        del val['name']
        if 'desc' not in val:
            logger.warning("Package %s has no description" % selfdict['name'])
            val['desc'] = ''
        if 'url' not in val:
            val['url'] = ''
        for x in val.keys():
            if x in squash:
                if len(val[x]) == 0:
                    logger.warning("Package %s has no %s" % (selfdict['name'],x))
                selfdict[x] = ''.join(val[x])
                # make sure we don't have elements larger than the db char
                # fields (255 chars; the old code cut to 254 and lost one
                # usable character)
                if len(selfdict[x]) > 255:
                    selfdict[x] = selfdict[x][:255]
            elif x == 'force':
                selfdict[x] = True
            elif x == 'version':
                # Split only on the LAST dash: pkgver itself may contain
                # dashes (e.g. '1.0-beta-2' -> ver '1.0-beta', rel '2').
                # A bare rsplit('-') split on every dash and produced a
                # wrong ver/rel pair for such versions.
                version = val[x][0].rsplit('-', 1)
                selfdict['ver'] = version[0]
                selfdict['rel'] = version[1]
            elif x == 'reason':
                selfdict[x] = int(val[x][0])
            else:
                # multi-valued fields (depends, groups, ...) stay as lists
                selfdict[x] = val[x]
        self.__dict__ = selfdict

    def __getattr__(self, name):
        """Default for fields absent from the repo db entry: 'force' is
        False, everything else is None."""
        if name == 'force':
            return False
        else:
            return None
|
|
|
|
|
|
def usage():
    """Write this script's usage text (the module docstring) to stdout."""
    sys.stdout.write(__doc__.strip())
    sys.stdout.write('\n')
|
|
|
|
|
|
def fetchiter_dict(cursor):
    """
    Generator over an executed DB API 2.0 cursor: fetch rows in batches
    of 30 and yield each row as a {column name: value} dict.
    """
    while True:
        batch = cursor.fetchmany(size=30)
        if not batch:
            return
        for record in batch:
            yield dictize(cursor, record)
|
|
|
|
|
|
def fetchone_dict(cursor):
    """
    Fetch the next row from an executed DB API 2.0 cursor and return it
    as a dict mapping column names to values.

    NOTE(review): assumes a row is available; if fetchone() returns None,
    dictize() will fail indexing it -- confirm callers always have a row.
    """
    row = cursor.fetchone()
    return dictize(cursor, row)
|
|
|
|
|
|
def dictize(cursor, row):
    """Map one result `row` to {column name: value} using the cursor's
    DB API 2.0 description (description[i][0] is the column name)."""
    return dict((desc[0], row[col])
                for col, desc in enumerate(cursor.description))
|
|
|
|
|
|
def db_update(archname, pkgs):
    """
    Parses a list and updates the Arch dev database accordingly.

    Arguments:
     archname -- Architecture name to update (matched case-insensitively
                 against the Arch table).
     pkgs -- A list of Pkg objects, all belonging to the same repo.

    Raises SomethingFishyException when the incoming sync db holds less
    than half the packages already in the web database, since applying
    it would orphan a large number of packages.
    """
    logger.info('Updating Arch: %s' % archname)
    repository = Repo.objects.get(name__iexact=pkgs[0].repo)
    architecture = Arch.objects.get(name__iexact=archname)
    dbpkgs = Package.objects.filter(arch=architecture, repo=repository)
    now = datetime.now()

    # go go set theory!
    # thank you python for having a set class <3
    logger.debug("Creating sets")
    dbset = set([pkg.pkgname for pkg in dbpkgs])
    syncset = set([pkg.name for pkg in pkgs])
    logger.info("%d packages in current web DB" % len(dbset))
    logger.info("%d packages in new updating db" % len(syncset))
    # packages in syncdb and not in database (add to database)
    logger.debug("Set theory: Packages in syncdb not in database")
    in_sync_not_db = syncset - dbset
    logger.info("%d packages in sync not db" % len(in_sync_not_db))

    # Try to catch those random orphaning issues that make Eric so unhappy.
    if len(syncset) < len(dbset) * .5:
        logger.error(".db.tar.gz has less than 50% the number of packages in the web database")
        raise SomethingFishyException(
            'it looks like the syncdb is twice as big as the new '
            'packages. WTF?')

    if len(syncset) < len(dbset) * .75:
        logger.warning(".db.tar.gz has 75% the number of packages in the web database.")

    for p in [x for x in pkgs if x.name in in_sync_not_db]:
        logger.debug("Adding package %s", p.name)
        ## note: maintainer is being set to orphan for now
        ## maybe later we can add logic to match pkgbuild maintainers
        ## to db maintainer ids
        pkg = Package(
            repo = repository, arch=architecture, maintainer_id = 0,
            needupdate = False, url = p.url, last_update = now,
            pkgname = p.name, pkgver = p.ver, pkgrel = p.rel,
            pkgdesc = p.desc)
        pkg.save()
        # files are not in the repo.db.tar.gz
        #for x in p.files:
        #    pkg.packagefile_set.create(path=x)
        if 'depends' in p.__dict__:
            for y in p.depends:
                # make sure we aren't adding self depends..
                # yes *sigh* i have seen them in pkgbuilds
                dpname,dpvcmp = re.match(r"([a-z0-9._-]+)(.*)", y).groups()
                if dpname == p.name:
                    logger.warning('Package %s has a depend on itself' % p.name)
                    continue
                pkg.packagedepend_set.create(depname=dpname, depvcmp=dpvcmp)
                logger.debug('Added %s as dep for pkg %s' % (dpname,p.name))

    # packages in database and not in syncdb (remove from database)
    logger.debug("Set theory: Packages in database not in syncdb")
    in_db_not_sync = dbset - syncset
    for p in in_db_not_sync:
        logger.info("Removing package %s from database", p)
        Package.objects.get(
            pkgname=p, arch=architecture, repo=repository).delete()

    # packages in both database and in syncdb (update in database)
    logger.debug("Set theory: Packages in database and syncdb")
    pkg_in_both = syncset & dbset
    for p in [x for x in pkgs if x.name in pkg_in_both]:
        logger.debug("Looking for package updates")
        dbp = dbpkgs.get(pkgname=p.name)
        # compare (ver, rel) as a tuple: the old string concatenation
        # could collide ('1.0'+'12' == '1.01'+'2') and skip real updates
        if (p.ver, p.rel) == (dbp.pkgver, dbp.pkgrel):
            continue
        logger.info("Updating package %s in database", p.name)
        pkg = Package.objects.get(
            pkgname=p.name,arch=architecture, repo=repository)
        pkg.pkgver = p.ver
        pkg.pkgrel = p.rel
        pkg.pkgdesc = p.desc
        pkg.url = p.url
        pkg.needupdate = False
        pkg.last_update = now
        pkg.save()

        # files are not in the repo.db.tar.gz
        #pkg.packagefile_set.all().delete()
        #for x in p.files:
        #    pkg.packagefile_set.create(path=x)
        pkg.packagedepend_set.all().delete()
        if 'depends' in p.__dict__:
            for y in p.depends:
                # use the same name pattern as the "add" branch above so
                # depend names containing '.' or '_' parse identically
                dpname,dpvcmp = re.match(r"([a-z0-9._-]+)(.*)", y).groups()
                pkg.packagedepend_set.create(depname=dpname, depvcmp=dpvcmp)

    logger.info('Finished updating Arch: %s' % archname)
|
|
|
|
|
|
def parse_inf(iofile):
    """
    Parses an Arch repo db information file and returns the variables
    found, as a dict mapping block name -> list of value lines.

    Only blocks whose '%NAME%' header names a variable in REPOVARS are
    kept; everything else is skipped.

    Arguments:
     iofile -- A StringIO, FileType, or other object with a readlines method.

    """
    store = {}
    lines = iofile.readlines()
    blockname = None
    # renamed from 'max' -- never shadow the builtin
    num_lines = len(lines)
    i = 0
    while i < num_lines:
        line = lines[i].strip()
        # a block starts with a '%NAME%' header for a known variable
        if len(line) > 0 and line[0] == '%' and line[1:-1].lower() in REPOVARS:
            blockname = line[1:-1].lower()
            logger.debug("Parsing package block %s",blockname)
            store[blockname] = []
            i += 1
            # collect value lines until the next blank line
            while i < num_lines and len(lines[i].strip()) > 0:
                store[blockname].append(lines[i].strip())
                i += 1
            # here is where i would convert arrays to strings
            # based on count and type, but i dont think it is needed now
        i += 1

    return store
|
|
|
|
|
|
def parse_repo(repopath):
    """
    Parses an Arch repo db file, and returns a list of Pkg objects.

    Arguments:
     repopath -- The path of a repository db file.

    """
    logger.info("Starting repo parsing")
    if not os.path.exists(repopath):
        # NOTE(review): a missing file is only logged here; tarfile.open
        # below will then raise on its own -- confirm this is intended.
        logger.error("Could not read file %s", repopath)

    logger.info("Reading repo tarfile")
    # repo name is everything before '.db.tar.gz' in the filename,
    # e.g. 'core' from 'core.db.tar.gz' (rindex raises if missing)
    filename = os.path.split(repopath)[1]
    rindex = filename.rindex('.db.tar.gz')
    reponame = filename[:rindex]

    repodb = tarfile.open(repopath,"r:gz")
    ## assuming well formed tar, with dir first then files after
    ## repo-add enforces this
    logger.debug("Starting package parsing")
    pkgs = []
    # tpkg buffers the concatenated 'desc' + 'depends' text of the
    # package entry (directory) currently being read
    tpkg = None
    while True:
        tarinfo = repodb.next()
        # a directory member (or end of archive) marks the boundary of a
        # package entry: flush the buffered text of the previous one
        if tarinfo == None or tarinfo.isdir():
            if tpkg != None:
                # rewind the buffer so parse_inf reads from the start
                tpkg.reset()
                data = parse_inf(tpkg)
                p = Pkg(data)
                p.repo = reponame
                logger.debug("Done parsing package %s", p.name)
                pkgs.append(p)
            if tarinfo == None:
                break
            # set new tpkg
            tpkg = StringIO()
        if tarinfo.isreg():
            # only the 'desc' and 'depends' files carry data we use
            if os.path.split(tarinfo.name)[1] in ('desc','depends'):
                tpkg.write(repodb.extractfile(tarinfo).read())
                tpkg.write('\n') # just in case
    repodb.close()
    logger.info("Finished repo parsing")
    return pkgs
|
|
|
|
|
|
def main(argv=None):
    """
    Parses repo.db.tar.gz file and returns exit status.

    Keyword Arguments:
     argv -- A list/array simulating a sys.argv (default None)
             If left empty, sys.argv is used

    Returns 0 in all handled cases (including bad arguments, where the
    usage text is printed instead).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 3:
        usage()
        return 0
    # check if arch is valid
    available_arches = [x.name for x in Arch.objects.all()]
    if argv[1] not in available_arches:
        usage()
        return 0
    else:
        primary_arch = argv[1]

    repo_file = os.path.normpath(argv[2])
    packages = parse_repo(repo_file)

    # sort packages by arch -- to handle noarch stuff
    packages_arches = {}
    for arch in available_arches:
        packages_arches[arch] = []

    for package in packages:
        if package.arch in ('any', primary_arch):
            # setdefault guards against 'any' not having a row in the
            # Arch table (it would then be missing from packages_arches)
            packages_arches.setdefault(package.arch, []).append(package)
        else:
            # database is trusted: drop packages of a foreign arch
            logger.warning("Package %s arch = %s" % (
                package.name,package.arch))
            #package.arch = primary_arch


    logger.info('Starting database updates.')
    for (arch, pkgs) in packages_arches.iteritems():
        if len(pkgs) > 0:
            db_update(arch,pkgs)
    logger.info('Finished database updates.')
    return 0
|
|
|
|
|
|
###
### Main eval
###

if __name__ == '__main__':
    # raise verbosity for interactive/cron runs; use the public API
    # instead of assigning the 'level' attribute directly
    logger.setLevel(INFO)
    sys.exit(main())
|
|
|