#!/usr/bin/env python3

# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Use this script to retrieve information from https://crashreport.libreoffice.org
# about a specific version of LibreOffice.
#
# Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/

import argparse
import requests
from bs4 import BeautifulSoup
import sys
import os
import math
from datetime import datetime
import urllib.parse


def convert_str_to_date(value):
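    # Normalize a date string as shown on the crash report site (full month
    # names and a trailing time component) into a short YY/MM/DD string.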
    value = value.replace('.', '')
    value = value.replace('March', 'Mar')
    value = value.replace('April', 'Apr')
    value = value.replace('June', 'Jun')
    value = value.replace('July', 'Jul')
    value = value.replace('Sept', 'Sep')
    # drop the time component, keeping only the date
    value = ", ".join(value.split(", ")[:-1])
    dtDate = datetime.strptime(value, '%b %d, %Y')

    return dtDate.strftime('%y/%m/%d')


def parse_version_url(url):
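    # Scrape the per-version overview table and return a dict mapping each
    # crash signature to [report count, first crash date, last crash date].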
    crashReports = {}

    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        sys.exit(1)

    table = soup.find("table", {"id": "data-table"}).tbody
    for tr in table.find_all("tr"):
        td_list = tr.find_all("td")
        crashName = td_list[0].a.text.strip()
        crashNumber = int(td_list[1].text.strip())
        firstCrashDate = convert_str_to_date(td_list[5].text.strip())
        lastCrashDate = convert_str_to_date(td_list[6].text.strip())
        crashReports[crashName] = [crashNumber, firstCrashDate, lastCrashDate]

    return crashReports


def parse_reports_and_get_most_recent_report_from_last_page(url):
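    # Sum the report counts across operating systems, jump to the last page of
    # results (50 reports per page) and return the total count together with
    # the ID, version and OS of the report with the highest version string,
    # preferring Windows reports because Linux symbols are less informative.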
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        raise

    count = 0
    os_tab = soup.find("table", {"id": "os_tab"}).tbody
    tr_list = os_tab.find_all("tr")
    for tr in tr_list:
        td_list = tr.find_all("td")
        count += int(td_list[1].text.strip())

    # There are 50 reports on each page.
    # Go to the last page based on the total count to get a recent report.
    last_page = math.ceil(count / 50)

    if last_page > 1:
        url = url + "?page=" + str(last_page)
        try:
            html_text = requests.get(url, timeout=200).text
            soup = BeautifulSoup(html_text, 'html.parser')
        except requests.exceptions.Timeout:
            print("Timeout requesting " + url)
            raise

    reports = soup.find("div", {"id": "reports"}).tbody
    ID, currentID = "", ""
    version, currentVersion = "", ""
    OS, currentOS = "", ""

    tr_list = reports.find_all("tr")
    for tr in tr_list:
        td_list = tr.find_all("td")

        currentID = td_list[0].a.text.strip()
        currentVersion = td_list[2].text.strip().split(': ')[1]
        currentOS = td_list[3].text.strip()

        # Keep the report with the highest version string; prefer Windows
        # reports, since symbols on Linux are generally not very informative.
        if currentOS == "windows" and currentVersion > version:
            version = currentVersion
            ID = currentID
            OS = currentOS

    # Fall back to the last report seen if no Windows report matched.
    if not version:
        version = currentVersion

    if not ID:
        ID = currentID

    if not OS:
        OS = currentOS

    return count, ID, version, OS


def parse_details_and_get_info(url, gitRepo):
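    # Scrape a crash_details page: return the crash reason, the top stack
    # frames that have source information (at most 11) and the matching source
    # lines read from the local LibreOffice repository checkout.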
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        raise

    details = soup.find("div", {"id": "details"}).tbody
    tr_list = details.find_all("tr")
    reason = tr_list[8].td.text.strip()

    stack = ""
    codeLine = ""

    count = 0
    frames = soup.find("div", {"id": "frames"}).tbody
    for tr in frames.find_all("tr"):
        td_list = tr.find_all("td")
        source = td_list[3].text.strip()
        if source and count <= 10:
            source = source.replace("\\", "/").replace("C:/cygwin64/home/buildslave/source/libo-core/", "")
            stack += source + "\n"
            count += 1

            codeFile = source.split(":")[0]
            codeNumber = source.split(":")[1]
            try:
                with open(os.path.join(gitRepo, codeFile)) as f:
                    lines = f.readlines()
                    for index, line in enumerate(lines):
                        if index + 1 == int(codeNumber):
                            codeLine += line.strip() + "\n"
            except FileNotFoundError:
                codeLine += "\n"
                continue

    if stack:
        # quote the multiline stack so it stays in a single TSV field
        stack = "\"" + stack + "\""

    if codeLine:
        # quote the multiline code listing as well
        codeLine = "\"" + codeLine + "\""

    return reason, stack, codeLine


if __name__ == '__main__':
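
    # Fetch the crash signatures reported for the requested version (last 30
    # days) and append any not already present to a tab-separated
    # crashes_<version>.csv file.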
    parser = argparse.ArgumentParser()

    parser.add_argument('--version', action='store', dest="version", required=True)
    parser.add_argument('--repository', action="store", dest="repository", required=True)

    args = parser.parse_args()

    crashes = parse_version_url(
            "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000&days=30")

    print(str(len(crashes)) + " crash reports in version " + args.version)

    crashesInFile = []
    fileName = "crashes_" + args.version.replace(".", "_") + ".csv"
    print("Using " + fileName)

    # Remember which signatures are already in the file so they are not
    # scraped and written a second time.
    bInsertHeader = False
    if os.path.exists(fileName):
        with open(fileName, "r") as f:
            lines = f.readlines()
            for line in lines:
                crashesInFile.append(line.split("\t")[0])
    else:
        bInsertHeader = True

    with open(fileName, "a") as f:
        if bInsertHeader:
            # Note: "Code Lines" '\n' relies on implicit string concatenation,
            # so the header ends with a newline instead of a trailing tab.
            line = '\t'.join(["Name", "Count", "First report", "Last Report",
                    "ID", "Version", "Reason", "OS", "Stack", "Code Lines" '\n'])
            f.write(line)
            f.flush()

        for k, lDate in crashes.items():
            if k not in crashesInFile:
                print("Parsing " + k)
                try:
                    crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                            "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
                    crashReason, crashStack, codeLine = parse_details_and_get_info(
                            "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
                    line = '\t'.join([k, str(crashCount), lDate[1], lDate[2],
                            crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, '\n'])
                    f.write(line)
                    f.flush()
                except requests.exceptions.Timeout:
                    # Skip this signature on timeout and move on to the next one.
                    continue