#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# This digs through a pile of Bugzilla instances and populates the cwd with
# a big collection of bug documents, in per-filetype directories, named after
# their bug ids, with a prefix indicating the originating bug tracker, e.g.
#
# fdo-bugid-X.suffix
# rhbz-bugid-X.suffix
# moz-bugid-X.suffix
#
# where X is the n'th attachment of that type in the bug.
#
# The results are stored in the current directory, categorized by the
# extension of the downloaded file. When a file already exists, it is assumed
# to have been downloaded by a previous run and to be up to date.
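#
# Example invocation (PARALLELISM is read by this script; the path and bug
# ids below are made up):
#
#   cd /path/to/bugdoc-corpus && PARALLELISM=10 .../core/bin/get-bugzilla-attachments-by-mimetype
#
# which leaves files such as doc/fdo12345-1.doc (tracker 'fdo', bug 12345,
# first matching attachment) in the current directory.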
from __future__ import print_function
import base64
import datetime
import glob
import os
import os.path
import re
import stat
import sys
import threading

try:
    import queue
except Exception:
    import Queue as queue
try:
    from urllib.request import urlopen
except Exception:
    from urllib import urlopen
try:
    import xmlrpc.client as xmlrpclib
except Exception:
    import xmlrpclib

from xml.dom import minidom
from xml.sax.saxutils import escape

from attachment_mimetypes import mimetypes

import feedparser


def urlopen_retry(url):
    """Open url, retry 3 times."""
    maxretries = 3
    for i in range(maxretries + 1):
        try:
            return urlopen(url)
        except IOError as e:
            print('caught IOError: ' + str(e))
            if maxretries == i:
                raise
            print('retrying...')


def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
    """Parse bug xml, download attachments with matching suffix."""
    bugid = url.rsplit('=', 2)[1]
    print('id is ' + prefix + bugid + ' ' + suffix)
    print('parsing ' + bugid)
    sock = urlopen_retry(url + '&ctype=xml')
    dom = minidom.parse(sock)
    sock.close()
    attachmentid = 0
    for attachment in dom.getElementsByTagName('attachment'):
        attachmentid += 1
        print(' mimetype is', end=' ')
        for node in attachment.childNodes:
            if node.nodeName == 'type':
                # check if attachment is deleted
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                print(node.firstChild.nodeValue, end=' ')
                if node.firstChild.nodeValue.lower() != mimetype.lower():
                    print('skipping')
                    break
            elif node.nodeName == 'data':
                # check if attachment is deleted (e.g. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                download = (suffix + '/' + prefix + bugid + '-' +
                            str(attachmentid) + '.' + suffix)
                if os.path.isfile(download):
                    print('assuming ' + download + ' is up to date')
                    continue

                # prevent re-downloading FDO attachments from TDF
                if prefix == 'tdf' and int(bugid) < 88776:
                    fdodownload = download.replace('tdf', 'fdo')
                    if os.path.isfile(fdodownload):
                        print('assuming FDO ' + fdodownload + ' is up to date')
                        continue

                print('downloading as ' + download)
                tmpfile = download + '.tmp'
                f = open(tmpfile, 'wb')
                f.write(base64.b64decode(node.firstChild.nodeValue))
                f.close()
                os.rename(tmpfile, download)
                break
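# Note: get_from_bug_url_via_xml() above and get_novell_bug_via_xml() below
# both derive the bug id from the last '=' in the show_bug.cgi URL; with a
# made-up id, 'https://bugs.freedesktop.org/show_bug.cgi?id=12345' yields
# bugid '12345' via url.rsplit('=', 2)[1].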


def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
    """Parse bug xml, download attachments with matching suffix."""
    bugid = url.rsplit('=', 2)[1]
    print('id is ' + prefix + bugid + ' ' + suffix)
    print('parsing ' + bugid)
    sock = urlopen_retry(url + '&ctype=xml')
    dom = minidom.parse(sock)
    sock.close()
    attachmentid = 0
    for comment in dom.getElementsByTagName('thetext'):
        commentText = comment.firstChild.nodeValue
        match = re.search(r'.*Created an attachment \(id=([0-9]+)\)',
                          commentText)
        if not match:
            continue

        attachmentid += 1

        download = (suffix + '/' + prefix + bugid + '-' +
                    str(attachmentid) + '.' + suffix)
        if os.path.isfile(download):
            print('assuming ' + download + ' is up to date')
            continue

        realAttachmentId = match.group(1)
        handle = urlopen_retry(novellattach + realAttachmentId)
        if not handle:
            print('attachment ' + realAttachmentId + ' is not accessible')
            continue
        print(' mimetype is', end=' ')

        info = handle.info()
        if info.get_content_type:
            remoteMime = info.get_content_type()
        else:
            remoteMime = info.gettype()
        print(remoteMime, end=' ')
        if remoteMime != mimetype:
            print('skipping')
            continue

        print('downloading as ' + download)
        tmpfile = download + '.tmp'
        f = open(tmpfile, 'wb')
        f.write(handle.read())
        f.close()
        os.rename(tmpfile, download)


def create_query(mimetype):
    """Query all bugs with suitable mimetype attachments."""
    query = {}
    query['query_format'] = 'advanced'
    query['field0-0-0'] = 'attachments.mimetype'
    query['type0-0-0'] = 'equals'
    query['value0-0-0'] = mimetype
    return query
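# For illustration, with one concrete mimetype: create_query('application/msword')
# returns {'query_format': 'advanced', 'field0-0-0': 'attachments.mimetype',
# 'type0-0-0': 'equals', 'value0-0-0': 'application/msword'}, i.e. one row of
# Bugzilla's advanced-search "boolean chart".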


def get_downloaded_files(prefix, suffix):
    """Generate list of existing downloads (matching pre/suffix)."""
    return glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix)))


def get_file_bz_ids(files, prefix):
    """Extract the bug ids from a list of downloaded files."""
    return set([os.path.basename(f).split('-')[0].replace(prefix, '', 1) for f in files])
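# For illustration (made-up filename):
#   get_file_bz_ids(['doc/fdo12345-1.doc'], 'fdo') -> {'12345'}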


def get_changed_date(files):
    """Compute date of last downloaded attachment."""
    newest = max([os.stat(f)[stat.ST_MTIME] for f in files])
    # Subtract a day to avoid timezone differences. The worst thing that
    # can happen is that we are going to process more bugs than necessary.
    return datetime.date.fromtimestamp(newest - 24 * 60 * 60)
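# For illustration (made-up date): if the newest download was last modified
# on 2020-05-02, this returns 2020-05-01, and the incremental queries below
# simply re-check one extra day of changes.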


def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
    """Poke Bugzilla via RPC query."""
    try:
        os.mkdir(suffix)
    except Exception:
        pass

    def process(query, full, have=[]):
        try:
            proxy = xmlrpclib.ServerProxy(rpcurl)
            result = proxy.Bug.search(query)
            bugs = result['bugs']
            print(str(len(bugs)) + ' bugs to process')

            if full:
                available = set([str(bug['id']) for bug in bugs])
                # we already have files from all available bugs
                if available.difference(set(have)) == set():
                    print('assuming all downloaded files are up to date')
                    return

            for bug in bugs:
                url = showurl + str(bug['id'])
                get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
        except xmlrpclib.Fault as err:
            print('A fault occurred')
            print('Fault code: ' + str(err.faultCode))
            print(err.faultString)

    query = create_query(mimetype)
    query['column_list'] = 'bug_id'

    files = get_downloaded_files(prefix, suffix)

    if files != []:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'days_elapsed'
        query_changed['type0-1-0'] = 'lessthaneq'
        query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days)
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))


def get_through_rss_query(queryurl, mimetype, prefix, suffix):
    """Poke Bugzilla via RSS query."""
    try:
        os.mkdir(suffix)
    except Exception:
        pass

    # Getting detailed bug information and downloading an attachment
    # body is not possible without logging into Novell bugzilla;
    # the get_novell_bug_via_xml function is a workaround for that
    # situation.
    get_bug_function = get_novell_bug_via_xml if prefix == 'novell' else get_from_bug_url_via_xml

    def process(query, full, have=[]):
        url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.items()])
        print('url is ' + url)
        d = feedparser.parse(url)
        print(str(len(d['entries'])) + ' bugs to process')

        entries = d['entries']
        if full:
            available = set([str(entry['id'].split('=')[-1]) for entry in entries])
            # we already have files from all available bugs
            if available.difference(set(have)) == set():
                print('assuming all downloaded files are up to date')
                return

        for entry in entries:
            try:
                get_bug_function(entry['id'], mimetype, prefix, suffix)
            except KeyboardInterrupt:
                raise  # Ctrl+C should work
            except Exception:
                print(entry['id'] + ' failed: ' + str(sys.exc_info()[0]))

    query = create_query(escape(mimetype.replace('+', '%2B')))
    query['ctype'] = 'rss'

    files = get_downloaded_files(prefix, suffix)

    if files != []:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'delta_ts'
        query_changed['type0-1-0'] = 'greaterthaneq'
        query_changed['value0-1-0'] = get_changed_date(files).isoformat()
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))
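# For illustration (one concrete pair, with the 'fdo' entry from rss_bugzillas
# below and a hypothetical mimetype), the RSS query assembled above is roughly:
#   http://bugs.freedesktop.org/buglist.cgi?query_format=advanced&field0-0-0=attachments.mimetype&type0-0-0=equals&value0-0-0=application/msword&ctype=rss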


# Searching for bugs that have attachments with a specific mimetype is not
# possible through the Launchpad API, so we iterate over all bugs of the
# most interesting source packages instead.
launchpad_pkgs = (
    'abiword',
    'calibre',
    'calligra',
    'gnumeric',
    'inkscape',
    'koffice',
    'libabw',
    'libcdr',
    'libe-book',
    'libetonyek',
    'libfreehand',
    'libmspub',
    'libmwaw',
    'liborcus',
    'libpagemaker',
    'libreoffice',
    'libvisio',
    'libwpd',
    'libwpg',
    'libwps',
    'openoffice.org',
    'python-uniconvertor',
    'scribus',
    'sk1',
    'unoconv',
)


def get_launchpad_bugs(prefix):
    """Query launchpad bugtracker (via launchpadlib)."""
    # launchpadlib python module is required to download launchpad attachments
    from launchpadlib.launchpad import Launchpad

    launchpad = Launchpad.login_anonymously('attachmentdownload', 'production')
    ubuntu = launchpad.distributions['ubuntu']

    for pkg in launchpad_pkgs:
        srcpkg = ubuntu.getSourcePackage(name=pkg)
        pkgbugs = srcpkg.searchTasks(status=['New', 'Fix Committed', 'Invalid',
                                             "Won't Fix", 'Confirmed',
                                             'Triaged', 'In Progress',
                                             'Incomplete',
                                             'Incomplete (with response)',
                                             'Incomplete (without response)',
                                             'Fix Released', 'Opinion',
                                             'Expired'])
        for bugtask in pkgbugs:
            bug = bugtask.bug
            bugid = str(bug.id)
            print('parsing ' + bugid + ' status: ' + bugtask.status +
                  ' title: ' + bug.title[:50])
            attachmentid = 0
            for attachment in bug.attachments:
                attachmentid += 1
                handle = attachment.data.open()
                if handle.content_type not in mimetypes:
                    # print('skipping')
                    continue

                suffix = mimetypes[handle.content_type]
                if not os.path.isdir(suffix):
                    try:
                        os.mkdir(suffix)
                    except Exception:
                        pass

                download = (suffix + '/' + prefix + bugid + '-' +
                            str(attachmentid) + '.' + suffix)
                if os.path.isfile(download):
                    print('assuming ' + bugid + ' is up to date')
                    break

                print('mimetype is ' + handle.content_type +
                      ' downloading as ' + download)

                tmpfile = download + '.tmp'
                f = open(tmpfile, 'wb')
                f.write(handle.read())
                f.close()
                os.rename(tmpfile, download)


rss_bugzillas = (
    # note: currently abisource has an expired TLS cert
    # ('abi', 'http://bugzilla.abisource.com/buglist.cgi'),  # added for abiword
    ('fdo', 'http://bugs.freedesktop.org/buglist.cgi'),
    ('gentoo', 'http://bugs.gentoo.org/buglist.cgi'),
    # ('gnome', 'http://bugzilla.gnome.org/buglist.cgi'),  # added for gnumeric
    ('kde', 'http://bugs.kde.org/buglist.cgi'),  # added for koffice/calligra
    ('mandriva', 'https://qa.mandriva.com/buglist.cgi'),
    ('moz', 'https://bugzilla.mozilla.org/buglist.cgi'),
    # It seems something has changed and it is no longer possible to
    # download any files from there.
    # NOTE: I am leaving it in the list, commented out, just so someone
    # does not add it back immediately :-)
    # ('novell', 'https://bugzilla.novell.com/buglist.cgi'),
    # note: running this script against bz.apache.org apparently causes one's IP
    # to be banned or something; you won't get new files in any case...
    # ('ooo', 'https://bz.apache.org/ooo/buglist.cgi'),
    ('tdf', 'http://bugs.documentfoundation.org/buglist.cgi'),
)

redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='

# Novell Bugzilla requires users to log in in order to get details of
# the bugs, such as attachment bodies. As a dirty workaround, we parse
# comments containing "Created an attachment (id=xxxxxx)" and download
# the attachments manually. python-bugzilla claims that it supports
# Novell bugzilla login, but it is not working right now, and the
# Novell bugzilla login system is a nightmare.
novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='
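# For illustration (made-up id): a comment containing
# "Created an attachment (id=123456)" leads to downloading
# https://bugzilla.novell.com/attachment.cgi?id=123456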


class manage_threads(threading.Thread):
    def run(self):
        while 1:
            # Try to receive a job from the queue
            try:
                # Get job from queue
                # Use job parameters to call our query
                # Then let the queue know we are done with this job
                (uri, mimetype, prefix, extension) = jobs.get(True, 6)
                try:
                    # set the thread name for easier debugging, if the
                    # prctl package is available
                    import prctl
                    prctl.set_name(prefix[:3] + ': ' + mimetype[-10:])
                except ImportError:
                    pass

                try:
                    get_through_rss_query(uri, mimetype, prefix, extension)
                finally:
                    jobs.task_done()
            except KeyboardInterrupt:
                raise  # Ctrl+C should work
            except queue.Empty:
                break


def generate_multi_threading():
    # Initialize threads
    for _i in range(max_threads):
        manage_threads().start()

    for (prefix, uri) in rss_bugzillas:
        # Create a job for every mimetype for a bugzilla
        for (mimetype, extension) in mimetypes.items():
            # It seems that bugzilla has problems returning that many results
            # (10000 results is probably a limit set somewhere), so this
            # mimetype is skipped for Mozilla's tracker.
            if mimetype == 'text/html' and prefix == 'moz':
                continue

            jobs.put([uri, mimetype, prefix, extension], block=True)
            print('successfully placed a job in the queue searching for ' +
                  mimetype + ' in bugtracker ' + prefix)

        # Continue when all mimetypes are done for a bugzilla
        print('STARTED all bugtracker ' + prefix)

    jobs.join()


# Number of threads to create (1 = no multi-threading; default = 20)
max_threads = int(os.environ.get('PARALLELISM', 20))
jobs = queue.Queue()

generate_multi_threading()

for (mimetype, extension) in mimetypes.items():
    get_through_rpc_query(redhatrpc, redhatbug, mimetype, 'rhbz', extension)

try:
    get_launchpad_bugs('lp')
except ImportError:
    print('launchpadlib unavailable, skipping Ubuntu tracker')
# vim:set shiftwidth=4 softtabstop=4 expandtab: