office-gobmx/bin/extract-tooltip.py

#!/usr/bin/env python
import sys
import os
import re
import urlparse

def usage():
    """Print the command-line synopsis of this script to stdout.

    The program name shown is the base name of ``sys.argv[0]``.
    """
    program_name = os.path.basename(sys.argv[0])
    template = """ usage: {program} inDir outDir
inDir: directory containing .ht files
outDir: target for the new files"""
    print(template.format(program=program_name))

def parseFile(filename):
    """Parse one .ht file into a dict mapping URL-unquoted key -> text.

    Every non-empty line must start with three whitespace-free fields, each
    followed by a single whitespace character, and then free text (which may
    be empty).  The second field, URL-unquoted, becomes the key and the free
    text becomes the value; a later line with the same key overwrites an
    earlier one.

    A line whose text is empty while its third field is not "0" is treated
    as incomplete: it is echoed to stdout and glued, without a separator, in
    front of the next non-empty line, and the result is parsed again.  An
    incomplete line still pending at the end of the file is discarded.

    Raises:
        ValueError: if a non-empty line does not have the expected layout.
    """
    # The context manager closes the handle even if reading fails.
    with open(filename, "r") as handle:
        data = [line.rstrip('\n') for line in handle]

    pairs = {}
    line_pattern = re.compile(r"^(\S+)\s(\S+)\s(\S+)\s((?:\s*\S*)+)$")
    pending_line = None
    for line in data:
        if not line:
            continue
        if pending_line is not None:
            # Diagnostic output: report which file needed a line join.
            print(filename)
            line = pending_line + line
            print(line)
            pending_line = None
        # A full match splits into ['', f1, f2, f3, text, '']; anything else
        # means the line did not match the pattern at all.
        split_line = line_pattern.split(line)
        if len(split_line) != 6:
            raise ValueError(
                "%s: cannot parse line %r" % (filename, line))
        if split_line[4] == "" and split_line[3] != "0":
            # Text is missing: hold the line back and join it with the next.
            print(line)
            print(split_line)
            pending_line = line
        else:
            pairs[urlparse.unquote(split_line[2])] = split_line[4]
    return pairs

def parseFiles(dir):
    """Parse every file in *dir* whose name ends in ".ht".

    Returns a list of ``[file name, parsed dict]`` pairs in the order
    ``os.listdir`` reports the files.  Each file name is printed to stdout
    once its file has been parsed.
    """
    parsed = []
    for entry in os.listdir(dir):
        if not entry.endswith(".ht"):
            continue
        contents = parseFile(os.path.join(dir, entry))
        print(entry)
        #print contents
        parsed.append([entry, contents])
    return parsed

def extractSharedEntries(strings):
    """Move entries common to every file into a separate "shared.ht" dict.

    *strings* is a list of ``[file name, dict]`` pairs.  A key/value pair
    counts as shared when every dict contains the key with an equal value.
    Shared keys are removed from each input dict in place, and a
    ``["shared.ht", shared_dict]`` pair is appended to *strings*.

    When a key is present in a dict but with a different value, a
    diagnostic and the key are printed to stdout for that dict.

    An empty *strings* list yields just the (empty) shared entry.

    Returns:
        The same list object that was passed in, after modification.
    """
    shared_dict = {}
    # Only keys of the first dict can be shared by all; with no input files
    # there is nothing to seed from.
    first_dict = strings[0][1] if strings else {}
    for key, value in first_dict.items():
        # check that the entry is the same in all dicts
        is_in_all_dicts = True
        for dict_file_pair in strings:
            entries = dict_file_pair[1]
            if key not in entries:
                is_in_all_dicts = False
            elif entries[key] != value:
                print("Element with different values")
                print(key)
                is_in_all_dicts = False
        if is_in_all_dicts:
            shared_dict[key] = value

    # Shared keys exist in every dict by construction, so pop() cannot miss.
    for dict_file_pair in strings:
        for key in shared_dict:
            dict_file_pair[1].pop(key)

    strings.append(["shared.ht", shared_dict])
    return strings

def writeOutFiles(dir, strings):
    """Write each ``[file name, dict]`` pair as a .properties file in *dir*.

    The output name is the input name with a trailing ".ht" replaced by
    ".properties"; only the suffix is rewritten, so ".ht" occurring earlier
    in the name is left alone.  Each dict entry is written as one
    ``key=value`` line.

    An entry that raises UnicodeDecodeError is skipped and its key and value
    are printed to stdout instead.
    """
    source_suffix = ".ht"
    for string in strings:
        file_name_base = string[0]
        if file_name_base.endswith(source_suffix):
            file_name_base = (
                file_name_base[:-len(source_suffix)] + ".properties")
        file_name = os.path.join(dir, file_name_base)
        # The context manager closes the file even on unexpected errors.
        with open(file_name, "w") as out:
            for key, value in string[1].items():
                try:
                    # Build the whole line first so a failing entry never
                    # leaves a partial "key=" fragment in the output.
                    out.write(key + "=" + value + "\n")
                except UnicodeDecodeError:
                    print(key)
                    print(value)

def main(args):
    """Run the extraction for an argv-style list ``[program, inDir, outDir]``.

    Prints the usage text and exits with status 1 unless exactly three
    items are given.  Otherwise parses the .ht files in inDir, splits off
    the entries shared by all of them and writes the results to outDir.
    """
    if len(args) != 3:
        usage()
        sys.exit(1)

    in_dir, out_dir = args[1], args[2]
    per_file = parseFiles(in_dir)
    writeOutFiles(out_dir, extractSharedEntries(per_file))

# Run the extraction only when executed as a script; the full argv
# (program name included) is handed to main(), which checks its length.
if __name__ == "__main__":
    main(sys.argv)