dbe05a624c
Nothing left to do, just switch the interpreter. Change-Id: I6a0ff0a019a66f6baf39794c853655f273676ea2 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/161261 Tested-by: Thorsten Behrens <thorsten.behrens@allotropia.de> Reviewed-by: Thorsten Behrens <thorsten.behrens@allotropia.de>
483 lines
17 KiB
Python
Executable file
483 lines
17 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# This file is part of the LibreOffice project.
|
|
#
|
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
#
|
|
|
|
# This digs through a pile of bugzilla's and populates the cwd with a big
|
|
# collection of bug-docs in per-filetype dirs with bug-ids as names with
|
|
# prefixes to indicate which bug-tracker, e.g.
|
|
#
|
|
# fdo-bugid-X.suffix
|
|
# rhbz-bugid-X.suffix
|
|
# moz-bugid-X.suffix
|
|
#
|
|
# where X is the n'th attachment of that type in the bug
|
|
#
|
|
# The results are stored in the current directory, categorized by the
|
|
# extension of the downloaded file. When a file already exists, it is assumed
|
|
# it is already downloaded by a previous run, and up-to-date.
|
|
|
|
from __future__ import print_function
|
|
|
|
import base64
|
|
import datetime
|
|
import glob
|
|
import os
|
|
import os.path
|
|
import re
|
|
import stat
|
|
import sys
|
|
import threading
|
|
try:
|
|
import queue
|
|
except Exception:
|
|
import Queue as queue
|
|
try:
|
|
from urllib.request import urlopen
|
|
except Exception:
|
|
from urllib import urlopen
|
|
try:
|
|
import xmlrpc.client as xmlrpclib
|
|
except Exception:
|
|
import xmlrpclib
|
|
from xml.dom import minidom
|
|
from xml.sax.saxutils import escape
|
|
|
|
from attachment_mimetypes import mimetypes
|
|
|
|
import feedparser
|
|
|
|
|
|
def urlopen_retry(url, maxretries=3):
    """Open url, retrying when an attempt fails with an IOError.

    url: address handed unchanged to urlopen().
    maxretries: how many additional attempts are made after the first one
        has failed (default 3, i.e. at most four attempts in total).
        Negative values are treated as 0.

    Returns whatever urlopen() returns for the first successful attempt.
    Re-raises the IOError of the final attempt when every attempt failed.
    """
    last_attempt = max(0, maxretries)
    for attempt in range(last_attempt + 1):
        try:
            return urlopen(url)
        except IOError as e:
            print('caught IOError: ' + str(e))
            if attempt == last_attempt:
                # out of retries: let the caller see the original error
                raise
            print('retrying...')
|
|
|
|
|
|
def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
    """Parse bug xml, download attachments with matching suffix.

    url: URL of a single bug; the bug id is the text after the last '='.
    mimetype: only attachments whose <type> equals this value
        (compared case-insensitively) are saved.
    prefix: bug tracker tag placed at the start of the file name.
    suffix: file extension; also the name of the directory written to.

    Each saved attachment ends up as suffix/<prefix><bugid>-<n>.<suffix>,
    where n is the 1-based position of the attachment in the bug xml.
    Files that already exist are assumed to be up to date and are kept.
    """
    # take the last '='-separated field so that additional query
    # parameters in front of the id cannot be mistaken for it
    bugid = url.rsplit('=', 1)[-1]
    print('id is ' + prefix + bugid + ' ' + suffix)
    print('parsing ' + bugid)
    sock = urlopen_retry(url + '&ctype=xml')
    try:
        dom = minidom.parse(sock)
    finally:
        # release the connection even when the xml cannot be parsed
        sock.close()

    attachments = dom.getElementsByTagName('attachment')
    for attachmentid, attachment in enumerate(attachments, 1):
        print(' mimetype is', end=' ')
        for node in attachment.childNodes:
            if node.nodeName == 'type':
                # check if attachment is deleted
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                print(node.firstChild.nodeValue, end=' ')
                if node.firstChild.nodeValue.lower() != mimetype.lower():
                    print('skipping')
                    break
            elif node.nodeName == 'data':
                # check if attachment is deleted (i.e. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                basename = bugid + '-' + str(attachmentid) + '.' + suffix
                download = suffix + '/' + prefix + basename
                if os.path.isfile(download):
                    print('assuming ' + download + ' is up to date')
                    continue

                # prevent re-downloading FDO attachments from TDF
                if prefix == 'tdf' and int(bugid) < 88776:
                    # build the name directly instead of string-replacing
                    # 'tdf', which would also rewrite a matching suffix
                    fdodownload = suffix + '/' + 'fdo' + basename
                    if os.path.isfile(fdodownload):
                        print('assuming FDO ' + fdodownload + ' is up to date')
                        continue

                print('downloading as ' + download)
                # decode before touching the disk so that broken data does
                # not leave an empty temporary file behind
                payload = base64.b64decode(node.firstChild.nodeValue)
                tmpfile = download + '.tmp'
                with open(tmpfile, 'wb') as f:
                    f.write(payload)
                # the final name only appears once the content is complete
                os.rename(tmpfile, download)
                break
|
|
|
|
|
|
def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
    """Parse bug xml, download attachments with matching suffix.

    The attachment ids are taken from comments of the form
    "Created an attachment (id=NNN)"; every such attachment is fetched
    from the novellattach URL and kept when the content type reported by
    the server equals mimetype.

    url: URL of a single bug; the bug id is the text after the last '='.
    mimetype: content type an attachment must have to be saved.
    prefix: bug tracker tag placed at the start of the file name.
    suffix: file extension; also the name of the directory written to.
    """
    # take the last '='-separated field so that additional query
    # parameters in front of the id cannot be mistaken for it
    bugid = url.rsplit('=', 1)[-1]
    print('id is ' + prefix + bugid + ' ' + suffix)
    print('parsing ' + bugid)
    sock = urlopen_retry(url + '&ctype=xml')
    try:
        dom = minidom.parse(sock)
    finally:
        # release the connection even when the xml cannot be parsed
        sock.close()

    attachmentid = 0
    for comment in dom.getElementsByTagName('thetext'):
        # an empty <thetext/> element has no child node to read text from
        if not comment.firstChild:
            continue
        commentText = comment.firstChild.nodeValue or ''
        match = re.search(r'.*Created an attachment \(id=([0-9]+)\)',
                          commentText)
        if not match:
            continue

        attachmentid += 1

        download = (suffix + '/' + prefix + bugid + '-' +
                    str(attachmentid) + '.' + suffix)
        if os.path.isfile(download):
            print('assuming ' + download + ' is up to date')
            continue

        realAttachmentId = match.group(1)
        handle = urlopen_retry(novellattach + realAttachmentId)
        if not handle:
            print('attachment ' + realAttachmentId + ' is not accessible')
            continue

        try:
            print(' mimetype is', end=' ')

            info = handle.info()
            # hasattr() so that a message object without
            # get_content_type() really reaches the gettype() fallback
            if hasattr(info, 'get_content_type'):
                remoteMime = info.get_content_type()
            else:
                remoteMime = info.gettype()
            print(remoteMime, end=' ')
            if remoteMime != mimetype:
                print('skipping')
                continue

            print('downloading as ' + download)
            tmpfile = download + '.tmp'
            with open(tmpfile, 'wb') as f:
                f.write(handle.read())
            # the final name only appears once the content is complete
            os.rename(tmpfile, download)
        finally:
            # also runs for the 'continue' above, so no handle is leaked
            handle.close()
|
|
|
|
|
|
def create_query(mimetype):
    """Build the search parameters selecting bugs by attachment mimetype.

    Returns a fresh dict on every call, so callers may add further keys
    to it.
    """
    return {
        'query_format': 'advanced',
        'field0-0-0': 'attachments.mimetype',
        'type0-0-0': 'equals',
        'value0-0-0': mimetype,
    }
|
|
|
|
|
|
def get_downloaded_files(prefix, suffix):
    """List existing downloads of one tracker (prefix) and type (suffix).

    Looks inside the directory named suffix, relative to the current
    directory, for files called <prefix>*.<suffix>.
    """
    pattern = '{0}*.{1}'.format(prefix, suffix)
    return glob.glob(os.path.join(suffix, pattern))
|
|
|
|
|
|
def get_file_bz_ids(files, prefix):
    """Collect the bug ids encoded in the names of downloaded files.

    For every path the part of the file name in front of the first '-'
    is taken and the first occurrence of prefix is removed from it.
    Returns the resulting strings as a set.
    """
    ids = set()
    for path in files:
        tracker_and_id = os.path.basename(path).split('-')[0]
        ids.add(tracker_and_id.replace(prefix, '', 1))
    return ids
|
|
|
|
|
|
def get_changed_date(files):
    """Compute date of last downloaded attachment, minus one day.

    files must name at least one existing file; the modification times
    of the files decide which one is the newest.
    """
    seconds_per_day = 24 * 60 * 60
    latest = max(os.stat(path)[stat.ST_MTIME] for path in files)
    # Going back a day papers over timezone differences.  At worst a few
    # more bugs than strictly necessary are processed again.
    return datetime.date.fromtimestamp(latest - seconds_per_day)
|
|
|
|
|
|
def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
    """Poke Bugzilla via RPC query.

    rpcurl: URL of the XML-RPC endpoint that is searched.
    showurl: URL to which a bug id is appended to address a single bug.
    mimetype: attachment mimetype to search for.
    prefix: bug tracker tag used in the names of the downloaded files.
    suffix: file extension; also the directory the files are stored in.

    When files of this kind were downloaded before, recently changed bugs
    are processed first; afterwards the complete list of bugs is checked.
    """
    try:
        os.mkdir(suffix)
    except OSError as err:
        # an already existing directory is the normal case on later runs
        if not os.path.isdir(suffix):
            print('cannot create directory ' + suffix + ': ' + str(err))

    def process(query, full, have=()):
        """Run one search and download the attachments of its bugs.

        full: the query lists all bugs, so the downloads can be skipped
            when every reported bug id is already contained in have.
        have: bug ids (strings) for which files exist locally.
        """
        try:
            proxy = xmlrpclib.ServerProxy(rpcurl)
            result = proxy.Bug.search(query)
            bugs = result['bugs']
            print(str(len(bugs)) + ' bugs to process')

            if full:
                available = set(str(bug['id']) for bug in bugs)
                # we already have files from all available bugs
                if available.issubset(have):
                    print('assuming all downloaded files are up to date')
                    return

            for bug in bugs:
                url = showurl + str(bug['id'])
                get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
        except xmlrpclib.Fault as err:
            print('A fault occurred')
            # faultCode is a number, it has to be converted before it can
            # be appended to the message
            print('Fault code: ' + str(err.faultCode))
            print(err.faultString)

    query = create_query(mimetype)
    query['column_list'] = 'bug_id'

    files = get_downloaded_files(prefix, suffix)

    if files != []:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'days_elapsed'
        query_changed['type0-1-0'] = 'lessthaneq'
        query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days)
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))
|
|
|
|
|
|
def get_through_rss_query(queryurl, mimetype, prefix, suffix):
    """Poke Bugzilla via RSS query.

    queryurl: URL of the bug list page; the query is appended to it.
    mimetype: attachment mimetype to search for.
    prefix: bug tracker tag used in the names of the downloaded files.
    suffix: file extension; also the directory the files are stored in.

    When files of this kind were downloaded before, recently changed bugs
    are processed first; afterwards the complete list of bugs is checked.
    A bug that fails to download is reported and does not stop the run.
    """
    try:
        os.mkdir(suffix)
    except OSError as err:
        # an already existing directory is the normal case on later runs
        # and when another thread handles the same suffix
        if not os.path.isdir(suffix):
            print('cannot create directory ' + suffix + ': ' + str(err))

    # Getting detailed bug information and downloading an attachment
    # body is not possible without logging in to Novell bugzilla
    # get_novell_bug_via_xml function is a workaround for that
    # situation
    get_bug_function = get_novell_bug_via_xml if prefix == 'novell' else get_from_bug_url_via_xml

    def process(query, full, have=()):
        """Fetch one RSS bug list and download the attachments of its bugs.

        full: the query lists all bugs, so the downloads can be skipped
            when every reported bug id is already contained in have.
        have: bug ids (strings) for which files exist locally.
        """
        url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.items()])
        print('url is ' + url)
        d = feedparser.parse(url)
        print(str(len(d['entries'])) + ' bugs to process')

        entries = d['entries']
        if full:
            available = set(str(entry['id'].split('=')[-1]) for entry in entries)
            # we already have files from all available bugs
            if available.issubset(have):
                print('assuming all downloaded files are up to date')
                return

        for entry in entries:
            try:
                get_bug_function(entry['id'], mimetype, prefix, suffix)
            except KeyboardInterrupt:
                raise  # Ctrl+C should work
            except Exception:
                # best effort: report the bug and go on with the next one
                print(entry['id'] + ' failed: ' + str(sys.exc_info()[0]))

    query = create_query(escape(mimetype.replace('+', '%2B')))
    query['ctype'] = 'rss'

    files = get_downloaded_files(prefix, suffix)

    if files != []:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'delta_ts'
        query_changed['type0-1-0'] = 'greaterthaneq'
        query_changed['value0-1-0'] = get_changed_date(files).isoformat()
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))
|
|
|
|
|
|
# The launchpad API offers no search for bugs having attachments with
# specific mimetypes, so all bugs of the most interesting source packages
# are iterated over instead.
launchpad_pkgs = (
    'abiword', 'calibre', 'calligra', 'gnumeric', 'inkscape',
    'koffice', 'libabw', 'libcdr', 'libe-book', 'libetonyek',
    'libfreehand', 'libmspub', 'libmwaw', 'liborcus', 'libpagemaker',
    'libreoffice', 'libvisio', 'libwpd', 'libwpg', 'libwps',
    'openoffice.org', 'python-uniconvertor', 'scribus', 'sk1', 'unoconv',
)
|
|
|
|
|
|
def get_launchpad_bugs(prefix):
    """Query launchpad bugtracker (via launchpadlib).

    Walks over the bug tasks of every Ubuntu source package listed in
    launchpad_pkgs and saves each attachment whose content type is a key
    of mimetypes as <suffix>/<prefix><bugid>-<n>.<suffix>, where suffix
    is the value mimetypes holds for that content type and n is the
    1-based position of the attachment in the bug.

    prefix: bug tracker tag placed at the start of the file names.

    Raises ImportError when the launchpadlib module is not installed.
    """
    # launchpadlib python module is required to download launchpad attachments
    from launchpadlib.launchpad import Launchpad

    statuses = ('New', 'Fix Committed', 'Invalid',
                "Won't Fix", 'Confirmed',
                'Triaged', 'In Progress',
                'Incomplete',
                'Incomplete (with response)',
                'Incomplete (without response)',
                'Fix Released', 'Opinion',
                'Expired')

    launchpad = Launchpad.login_anonymously('attachmentdownload', 'production')
    ubuntu = launchpad.distributions['ubuntu']

    for pkg in launchpad_pkgs:
        srcpkg = ubuntu.getSourcePackage(name=pkg)
        pkgbugs = srcpkg.searchTasks(status=list(statuses))

        for bugtask in pkgbugs:
            bug = bugtask.bug
            bugid = str(bug.id)
            print('parsing ' + bugid + ' status: ' + bugtask.status +
                  ' title: ' + bug.title[:50])
            for attachmentid, attachment in enumerate(bug.attachments, 1):
                handle = attachment.data.open()
                if handle.content_type not in mimetypes:
                    continue

                suffix = mimetypes[handle.content_type]
                if not os.path.isdir(suffix):
                    try:
                        os.mkdir(suffix)
                    except OSError as err:
                        print('cannot create directory ' + suffix + ': ' +
                              str(err))

                download = (suffix + '/' + prefix + bugid + '-' +
                            str(attachmentid) + '.' + suffix)

                if os.path.isfile(download):
                    # one existing file is taken as a sign that the whole
                    # bug was handled before: skip its other attachments
                    print('assuming ' + bugid + ' is up to date')
                    break

                print('mimetype is ' + handle.content_type +
                      ' downloading as ' + download)

                tmpfile = download + '.tmp'
                with open(tmpfile, 'wb') as f:
                    f.write(handle.read())
                # the final name only appears once the content is complete
                os.rename(tmpfile, download)
|
|
|
|
|
|
# (prefix, bug list URL) of every bugzilla that is queried via RSS.
# Disabled trackers stay in the list as comments, together with the reason.
rss_bugzillas = (
    # note: currently abisource has an expired TLS cert
    # ('abi', 'http://bugzilla.abisource.com/buglist.cgi'),  # added for abiword
    ('fdo', 'http://bugs.freedesktop.org/buglist.cgi'),
    ('gentoo', 'http://bugs.gentoo.org/buglist.cgi'),
    # ('gnome', 'http://bugzilla.gnome.org/buglist.cgi'),  # added for gnumeric
    ('kde', 'http://bugs.kde.org/buglist.cgi'),  # added for koffice/calligra
    ('mandriva', 'https://qa.mandriva.com/buglist.cgi'),
    ('moz', 'https://bugzilla.mozilla.org/buglist.cgi'),
    # It seems something has changed and it is no longer possible to
    # download any files from there.
    # NOTE: I am leaving it in the list, commented out, just so someone
    # does not add it back immediately .-)
    # ('novell', 'https://bugzilla.novell.com/buglist.cgi'),
    # note: running this script against bz.apache.org apparently causes
    # one's IP to be banned or something; you won't get new files in any
    # case...
    # ('ooo', 'https://bz.apache.org/ooo/buglist.cgi'),
    ('tdf', 'http://bugs.documentfoundation.org/buglist.cgi'),
)

# Red Hat bugzilla: XML-RPC endpoint and the URL prefix of a single bug.
redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='

# Novell Bugzilla requires users to log in, in order to get details of
# the bugs such as attachment bodies etc.  As a dirty workaround, we
# parse comments containing "Created an attachment (id=xxxxxx)" and
# download attachments manually.  python-bugzilla claims that it
# supports Novell bugzilla login, but it's not working right now and
# the novell bugzilla login system is a nightmare.
novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='
|
|
|
|
|
|
class manage_threads(threading.Thread):
    """Worker thread that runs the RSS queries found in the jobs queue."""

    def run(self):
        """Process jobs until the queue has stayed empty for 6 seconds.

        A job is a (uri, mimetype, prefix, extension) sequence that is
        handed to get_through_rss_query().  A failing job is reported and
        the thread carries on with the next one; every job taken from the
        queue is marked as done, whatever its outcome.
        """
        while True:
            # Try to receive a job from queue
            try:
                (uri, mimetype, prefix, extension) = jobs.get(True, 6)
            except queue.Empty:
                break

            try:
                try:
                    # set thread name for easier debugging, if process
                    # ctl package is available
                    import prctl
                    prctl.set_name(prefix[:3] + ': ' + mimetype[-10:])
                except ImportError:
                    pass

                # Use job parameters to call our query
                get_through_rss_query(uri, mimetype, prefix, extension)
            except Exception as err:
                # Keep the worker alive: a thread that dies here would no
                # longer take jobs, and with all workers gone the join()
                # on the queue could never return.  KeyboardInterrupt is
                # not an Exception, so Ctrl+C still gets through.
                print('job ' + prefix + ' ' + mimetype + ' failed: ' +
                      str(err))
            finally:
                # Then let the queue know we are done with this job
                jobs.task_done()
|
|
|
|
|
|
def generate_multi_threading():
    """Start the worker threads, queue all RSS jobs and wait for them.

    max_threads workers are started; one job per mimetype is queued for
    every tracker in rss_bugzillas.  Returns once jobs.join() reports that
    every queued job has been marked as done.
    """
    # Initialize threads
    for _unused in range(max_threads):
        worker = manage_threads()
        worker.start()

    for (prefix, uri) in rss_bugzillas:
        # Create a job for every mimetype for a bugzilla
        for (mimetype, extension) in mimetypes.items():
            # It seems that bugzilla has problems returning that many results
            # (10000 results is probably a limit set somewhere) so we always
            # end processing the complete list.
            if (prefix, mimetype) == ('moz', 'text/html'):
                continue

            job = [uri, mimetype, prefix, extension]
            jobs.put(job, block=True)
            print('successfully placed a job in the queue searching for ' +
                  mimetype + ' in bugtracker ' + prefix)

        # Continue when all mimetypes are done for a bugzilla
        print('STARTED all bugtracker ' + prefix)

    jobs.join()
|
|
|
|
|
|
# Number of threads to create, (1 = without multi-threading, default = 20)
try:
    max_threads = int(os.environ.get('PARALLELISM', 20))
except ValueError:
    print('ignoring invalid PARALLELISM value, using 20 threads')
    max_threads = 20
# at least one worker is needed, otherwise nothing would ever take the
# queued jobs and waiting for the queue would never end
max_threads = max(1, max_threads)
jobs = queue.Queue()

# all bugzillas that offer RSS bug lists, in parallel
generate_multi_threading()

# Red Hat bugzilla is queried through XML-RPC, one mimetype after another
for (mimetype, extension) in mimetypes.items():
    get_through_rpc_query(redhatrpc, redhatbug, mimetype, 'rhbz', extension)

try:
    get_launchpad_bugs('lp')
except ImportError:
    print('launchpadlib unavailable, skipping Ubuntu tracker')
|
|
|
|
# vim:set shiftwidth=4 softtabstop=4 expandtab:
|