normality/diffscript.py

import pprint
import re

# Line indices (6180 through 6197 inclusive) that scorelines leaves out of the diff report.
IGNORED_LINES = set(range(6180, 6198))

def readstring(f):
    """Read one null-terminated string from the binary file object *f*.

    Reading starts at the current position of *f* and consumes bytes up to
    and including the terminating null byte. If the end of the file is
    reached before a null byte is found, the bytes read so far are returned.

    Returns the collected bytes (without the terminator) decoded as Latin-1.
    """
    collected = bytearray()
    while True:
        byte = f.read(1)
        # An empty read means end of file; without this check a missing
        # terminator would make the loop spin forever.
        if not byte or byte == b'\0':
            break
        collected += byte
    return collected.decode('latin-1')

# lang.dat is encoded in a fairly simple format - a sequence of about 6000 4-byte little-endian integers,
# which correspond to offsets of strings within the file. This is followed by a series of null-terminated
# single-byte strings (decoded here as Latin-1), which are pointed at by the offsets. Sometimes those
# offsets are 0; this corresponds to a missing string at that index.
def parsedat(filename):
    """Parse a lang.dat-style file into a list of strings.

    The file starts with a table of 4-byte little-endian offsets. Each
    non-zero offset points at a null-terminated string elsewhere in the
    file; an offset of 0 marks a missing string.

    Returns a list with one entry per offset, in table order, where missing
    strings are represented by ''.
    """
    with open(filename, "rb") as f:
        offsets = []
        # The offset table is assumed to end where the first referenced
        # string begins, i.e. at the first non-zero offset. Zero offsets
        # are "missing string" markers and cannot mark the end of the table.
        index_end = None
        while index_end is None or f.tell() < index_end:
            raw = f.read(4)
            # A short read means the file ended inside the table (or is
            # empty); stop rather than appending zeros forever.
            if len(raw) < 4:
                break
            offset = int.from_bytes(raw, 'little')
            offsets.append(offset)
            if index_end is None and offset != 0:
                index_end = offset
        lines = []
        for offset in offsets:
            if offset != 0:
                f.seek(offset)
                lines.append(readstring(f))
            else:
                lines.append('')
        return lines

def dumplines(filename, lines1, lines2):
    """Write every pair of lines to *filename*, labelled with its index.

    Each pair is written as three lines: "[index]", "UK: <line1>" and
    "US: <line2>". Pairing stops at the end of the shorter input list.
    """
    # The strings may contain non-ASCII characters, so the output encoding
    # is fixed to UTF-8 instead of depending on the platform's locale.
    with open(filename, "wt", encoding="utf-8") as f:
        for lineno, (line1, line2) in enumerate(zip(lines1, lines2)):
            f.write(f"[{lineno}]\nUK: {line1}\nUS: {line2}\n")

def diffscore(line1, line2):
    """Return a rough measure of how much two lines differ.

    Both lines are lower-cased and split on non-word characters. The score
    is the total length of the words that occur in exactly one of the two
    lines; 0 means both lines contain the same set of words.
    """
    words1 = set(re.split(r"\W", line1.lower()))
    words2 = set(re.split(r"\W", line2.lower()))
    score = 0
    for word in words1.symmetric_difference(words2):
        score += len(word)
    return score

def scorelines(filename, lines1, lines2):
    """Write the differing line pairs to *filename*, largest difference first.

    Every pair is scored with diffscore. Pairs scoring 0 and pairs whose
    index is in IGNORED_LINES are left out. Each remaining pair is written
    as "[index] (diff score: N)", "UK: <line1>" and "US: <line2>". Pairing
    stops at the end of the shorter input list.
    """
    scored = []
    for lineno, (line1, line2) in enumerate(zip(lines1, lines2)):
        if lineno in IGNORED_LINES:
            continue
        # Score each pair once and reuse the value for filtering and output.
        score = diffscore(line1, line2)
        if score > 0:
            scored.append((score, lineno, line1, line2))
    # The sort is stable, so pairs with equal scores stay in index order.
    scored.sort(key=lambda entry: entry[0], reverse=True)
    # The strings may contain non-ASCII characters, so the output encoding
    # is fixed to UTF-8 instead of depending on the platform's locale.
    with open(filename, "wt", encoding="utf-8") as f:
        for score, lineno, line1, line2 in scored:
            f.write(f"[{lineno}] (diff score: {score})\nUK: {line1}\nUS: {line2}\n")

# Parse each language file once and feed the same lists to both reports.
uk_lines = parsedat("lang.dat.uk")
us_lines = parsedat("lang.dat.us")
dumplines("all-subtitles-uk-us.txt", uk_lines, us_lines)
scorelines("diff-subtitles-uk-us.txt", uk_lines, us_lines)