2010-09-16 14:45:30 -07:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
|
|
mirrorcheck command
|
|
|
|
|
|
|
|
Poll every active mirror URL we have in the database, grab the 'lastsync' file,
|
|
|
|
and record details about how long it took and how up to date the mirror is. If
|
|
|
|
we encounter errors, record those as well.
|
|
|
|
|
|
|
|
Usage: ./manage.py mirrorcheck
|
|
|
|
"""
|
|
|
|
|
|
|
|
import logging
|
2012-05-13 18:15:00 -07:00
|
|
|
import os
|
2010-09-16 14:45:30 -07:00
|
|
|
import re
|
|
|
|
import socket
|
2015-06-18 14:22:24 -07:00
|
|
|
import ssl
|
2012-05-13 18:15:00 -07:00
|
|
|
import subprocess
|
|
|
|
import tempfile
|
2023-11-02 04:52:30 -07:00
|
|
|
import time
|
2018-07-31 12:19:49 -07:00
|
|
|
import urllib
|
2023-11-02 04:52:30 -07:00
|
|
|
from collections import deque
|
|
|
|
from datetime import datetime, timedelta, timezone
|
|
|
|
from http.client import HTTPException
|
|
|
|
from queue import Empty, Queue
|
|
|
|
from threading import Thread
|
2010-09-16 14:45:30 -07:00
|
|
|
|
2023-11-02 04:52:30 -07:00
|
|
|
from django.conf import settings
|
2018-01-01 11:39:02 -08:00
|
|
|
from django.core.management.base import BaseCommand
|
2012-05-13 18:15:00 -07:00
|
|
|
from django.db import transaction
|
2012-07-24 07:19:48 -07:00
|
|
|
from django.utils.timezone import now
|
2012-05-13 18:15:00 -07:00
|
|
|
|
2023-11-02 04:52:30 -07:00
|
|
|
from mirrors.models import CheckLocation, MirrorLog, MirrorUrl
|
2010-09-16 14:45:30 -07:00
|
|
|
|
2021-12-23 06:02:28 -08:00
|
|
|
# Module-level logger shared by all checker functions below; the default
# WARNING level is overridden in Command.handle() from the --verbosity option.
logger = logging.getLogger("command")
logger.setLevel(logging.WARNING)
|
2010-09-16 14:45:30 -07:00
|
|
|
|
2013-03-12 18:18:51 -07:00
|
|
|
|
2018-01-01 11:39:02 -08:00
|
|
|
class Command(BaseCommand):
    help = "Runs a check on all known mirror URLs to determine their up-to-date status."

    def add_arguments(self, parser):
        """Register the -t/--timeout and -l/--location command options."""
        parser.add_argument('-t', '--timeout',
                            dest='timeout',
                            type=float,
                            default=10.0,
                            help='Timeout value for connecting to URL')
        parser.add_argument('-l', '--location',
                            dest='location',
                            type=int,
                            help='ID of CheckLocation object to use for this run')

    def handle(self, **options):
        """Check every eligible mirror URL and persist the results."""
        # Translate Django's --verbosity option into a logger level.
        verbosity = int(options.get('verbosity', 0))
        if verbosity >= 2:
            logger.level = logging.DEBUG
        elif verbosity == 1:
            logger.level = logging.WARNING
        elif verbosity == 0:
            logger.level = logging.ERROR

        timeout = options.get('timeout')

        # Only poll URLs that belong to active, public mirrors.
        urls = MirrorUrl.objects.select_related('protocol').filter(
            active=True, mirror__active=True, mirror__public=True)

        location = options.get('location', None)
        if location:
            location = CheckLocation.objects.get(id=location)
            # Force all socket resolution onto this location's address family,
            # and skip URLs that cannot be reached over it.
            monkeypatch_getaddrinfo(location.family)
            if location.family == socket.AF_INET6:
                urls = urls.filter(has_ipv6=True)
            elif location.family == socket.AF_INET:
                urls = urls.filter(has_ipv4=True)

        checker = MirrorCheckPool(urls, location, timeout)
        checker.run()
        checker.cleanup()
        return 0
|
2010-09-16 14:45:30 -07:00
|
|
|
|
2012-04-29 19:26:23 -07:00
|
|
|
|
2013-03-06 18:58:09 -08:00
|
|
|
def monkeypatch_getaddrinfo(force_family=socket.AF_INET):
    '''Force the Python socket module to connect over the designated family;
    e.g. socket.AF_INET or socket.AF_INET6.'''
    real_getaddrinfo = socket.getaddrinfo

    def forced_family(host, port, family=0, socktype=0, proto=0, flags=0):
        # Whatever family the caller asked for is discarded and replaced
        # with the forced one; all other arguments pass through untouched.
        return real_getaddrinfo(host, port, force_family, socktype, proto, flags)

    socket.getaddrinfo = forced_family
|
|
|
|
|
|
|
|
|
2012-05-13 18:15:00 -07:00
|
|
|
def parse_lastsync(log, data):
    '''lastsync file should be an epoch value created by us.'''
    try:
        stamp = int(data)
        log.last_sync = datetime.fromtimestamp(stamp, tz=timezone.utc)
    except (TypeError, ValueError):
        # Never log the raw lastsync payload; sometimes we get back a
        # crazy-encoded web page instead of an epoch value. An unparseable
        # timestamp means the check failed.
        log.last_sync = None
        log.error = "Could not parse time from lastsync"
        log.is_success = False
|
|
|
|
|
|
|
|
|
2013-03-06 18:58:09 -08:00
|
|
|
def check_mirror_url(mirror_url, location, timeout):
    '''Fetch 'lastsync' from a single HTTP(S)/FTP mirror URL and return an
    unsaved MirrorLog describing the outcome.

    mirror_url: the MirrorUrl to check; 'lastsync' is appended directly to
        its url attribute (assumes the URL ends with '/' -- TODO confirm).
    location: CheckLocation this check runs from (may be None).
    timeout: socket timeout in seconds for the request.

    NOTE: the except-clause order below matters; more specific exceptions
    (HTTPError, URLError) must be handled before the broader socket errors.
    '''
    url = mirror_url.url + 'lastsync'
    logger.info("checking URL %s", url)
    # check_time is set now; duration/last_sync/error are filled in below.
    log = MirrorLog(url=mirror_url, check_time=now(), location=location)
    headers = {'User-Agent': 'archweb/1.0'}
    req = urllib.request.Request(url, None, headers)
    start = time.time()
    try:
        result = urllib.request.urlopen(req, timeout=timeout)
        data = result.read()
        result.close()
        end = time.time()
        # parse_lastsync sets last_sync, or flags the log as failed.
        parse_lastsync(log, data)
        log.duration = end - start
        logger.debug("success: %s, %.2f", url, log.duration)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            # we have a duration, just not a success
            end = time.time()
            log.duration = end - start
        log.is_success = False
        log.error = str(e)
        logger.debug("failed: %s, %s", url, log.error)
    except urllib.error.URLError as e:
        log.is_success = False
        log.error = e.reason
        # FTP servers report a missing file as a '550' string reason; treat
        # it like the 404 case and still record the duration.
        if isinstance(e.reason, str) and \
                re.search(r'550.*No such file', e.reason):
            # similar to 404 case above, still record duration
            end = time.time()
            log.duration = end - start
        # Replace non-string reasons with a readable message where we can.
        if isinstance(e.reason, socket.timeout):
            log.error = "Connection timed out."
        elif isinstance(e.reason, socket.error):
            # last args element is the human-readable error message
            log.error = e.reason.args[-1]
        logger.debug("failed: %s, %s", url, log.error)
    except HTTPException:
        # e.g., BadStatusLine
        log.is_success = False
        log.error = "Exception in processing HTTP request."
        logger.debug("failed: %s, %s", url, log.error)
    except ssl.CertificateError as e:
        log.is_success = False
        log.error = str(e)
        logger.debug("failed: %s, %s", url, log.error)
    except socket.timeout:
        log.is_success = False
        log.error = "Connection timed out."
        logger.debug("failed: %s, %s", url, log.error)
    except socket.error as e:
        log.is_success = False
        log.error = str(e)
        logger.debug("failed: %s, %s", url, log.error)

    return log
|
|
|
|
|
|
|
|
|
2013-03-06 18:58:09 -08:00
|
|
|
def check_rsync_url(mirror_url, location, timeout):
    '''Fetch 'lastsync' from a single rsync mirror URL via the rsync binary
    and return an unsaved MirrorLog describing the outcome.

    mirror_url: the MirrorUrl to check; 'lastsync' is appended directly to
        its url attribute (assumes the URL ends with '/' -- TODO confirm).
    location: CheckLocation this check runs from; its address family
        selects --ipv4/--ipv6 (may be None).
    timeout: seconds passed to rsync's --contimeout and --timeout.
    '''
    url = mirror_url.url + 'lastsync'
    logger.info("checking URL %s", url)
    log = MirrorLog(url=mirror_url, check_time=now(), location=location)

    # rsync writes the fetched file to disk, so use a private scratch dir.
    tempdir = tempfile.mkdtemp()
    ipopt = ''
    if location:
        if location.family == socket.AF_INET6:
            ipopt = '--ipv6'
        elif location.family == socket.AF_INET:
            ipopt = '--ipv4'
    lastsync_path = os.path.join(tempdir, 'lastsync')
    rsync_cmd = ["rsync", "--quiet", "--contimeout=%d" % timeout, "--timeout=%d" % timeout]
    if ipopt:
        rsync_cmd.append(ipopt)
    rsync_cmd.append(url)
    rsync_cmd.append(lastsync_path)
    try:
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("rsync cmd: %s", ' '.join(rsync_cmd))
        start = time.time()
        # subprocess.DEVNULL replaces manually opening os.devnull; stderr is
        # captured so failures can be recorded on the log entry.
        proc = subprocess.Popen(rsync_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
        _, errdata = proc.communicate()
        end = time.time()
        log.duration = end - start
        if proc.returncode != 0:
            logger.debug("error: %s, %s", url, errdata)
            log.is_success = False
            log.error = errdata.strip().decode('utf-8')
            # look at rsync error code- if we had a command error or timed out,
            # don't record a duration as it is misleading
            if proc.returncode in (1, 30, 35):
                log.duration = None
        else:
            logger.debug("success: %s, %.2f", url, log.duration)
            if os.path.exists(lastsync_path):
                with open(lastsync_path, 'r') as lastsync:
                    parse_lastsync(log, lastsync.read())
            else:
                # rsync succeeded but produced no file; treat as unparseable.
                parse_lastsync(log, None)
    finally:
        # Always clean up the scratch file and directory.
        if os.path.exists(lastsync_path):
            os.unlink(lastsync_path)
        os.rmdir(tempdir)

    return log
|
|
|
|
|
2012-04-29 19:26:23 -07:00
|
|
|
|
2013-03-06 18:58:09 -08:00
|
|
|
def mirror_url_worker(work, output, location, timeout):
    '''Thread target: drain URLs from the work queue, check each one, and
    append the resulting logs to output; returns once the queue is empty.'''
    while True:
        try:
            url = work.get(block=False)
        except Empty:
            return 0
        try:
            if url.protocol.protocol == 'rsync':
                log = check_rsync_url(url, location, timeout)
            elif (url.protocol.protocol == 'ftp' and location and location.family == socket.AF_INET6):
                # IPv6 + FTP don't work; skip checking completely
                log = None
            else:
                log = check_mirror_url(url, location, timeout)
            if log:
                output.append(log)
        finally:
            # Mark the item done even if the check raised, so tasks.join()
            # in the pool cannot hang.
            work.task_done()
|
|
|
|
|
2012-04-29 19:26:23 -07:00
|
|
|
|
2010-09-16 14:45:30 -07:00
|
|
|
class MirrorCheckPool(object):
    '''Fixed-size pool of daemon threads that check a batch of mirror URLs
    and bulk-save the resulting MirrorLog entries.'''

    def __init__(self, urls, location, timeout=10, num_threads=10):
        """Queue up every URL and prepare (but do not start) the workers."""
        self.tasks = Queue()
        # deque.append is thread-safe, so workers can share this directly.
        self.logs = deque()
        for url in urls:
            self.tasks.put(url)
        self.threads = []
        for _ in range(num_threads):
            thread = Thread(target=mirror_url_worker, args=(self.tasks, self.logs, location, timeout))
            thread.daemon = True
            self.threads.append(thread)

    @transaction.atomic
    def run(self):
        """Start all workers, wait until the queue drains, then bulk-insert
        the collected MirrorLog entries in a single transaction."""
        logger.debug("starting threads")
        for thread in self.threads:
            thread.start()
        logger.debug("joining on all threads")
        self.tasks.join()
        logger.debug("processing %d log entries", len(self.logs))
        MirrorLog.objects.bulk_create(self.logs)
        logger.debug("log entries saved")

    def cleanup(self):
        """Delete MirrorLog entries older than the configured retention
        period (settings.MIRRORLOG_RETENTION_PERIOD, default 365 days)."""
        days = getattr(settings, 'MIRRORLOG_RETENTION_PERIOD', 365)
        removal_date = now() - timedelta(days=days)
        # fixed message: was "older ... objects then %s"
        logger.info("cleaning up MirrorLog objects older than %s", removal_date.strftime('%Y-%m-%d'))
        MirrorLog.objects.filter(check_time__lt=removal_date).delete()
        logger.info('Finished cleaning up old MirrorLog objects')
|
|
|
|
|
2010-09-16 14:45:30 -07:00
|
|
|
# vim: set ts=4 sw=4 et:
|