normality/diffscript.py
2024-07-06 21:55:38 -04:00

48 lines
1.7 KiB
Python

import pprint
import re
# Line numbers that scorelines() leaves out of the diff report. These are
# 0-based positions in the parsed line lists; the range end is exclusive,
# so this covers 6180 through 6197.
IGNORED_LINES = set(range(6180, 6198))
def readstring(f):
    """Read one NUL-terminated string from the binary stream *f*.

    Bytes are consumed one at a time from the current position up to and
    including the terminating ``\\0``; the terminator is not part of the
    result. If the stream ends before a terminator is seen, whatever was
    read so far is returned.

    Args:
        f: A binary file-like object positioned at the start of the string.

    Returns:
        The bytes read, decoded as Latin-1.
    """
    chunks = []
    while True:
        byte = f.read(1)
        # read(1) yields b'' at end of file, which never equals b'\0';
        # without the emptiness check an unterminated string loops forever.
        if not byte or byte == b'\0':
            break
        chunks.append(byte)
    return b''.join(chunks).decode('latin-1')
def parsedat(filename):
    """Parse an offset-table ``.dat`` file into a list of strings.

    The file is read as a table of 4-byte little-endian offsets starting at
    byte 0. The table is taken to end where the file position reaches the
    value of the first offset. Each non-zero offset is then followed to a
    NUL-terminated string; a zero offset produces an empty string.

    Args:
        filename: Path of the file to read.

    Returns:
        One string per table entry, in table order. An empty file yields
        an empty list.
    """
    with open(filename, "rb") as f:
        offsets = []
        # NOTE(review): if the first entry is 0 the table is treated as
        # having a single entry -- confirm that matches the file format.
        while not offsets or f.tell() < offsets[0]:
            raw = f.read(4)
            # A short read means the file ended inside the table (or is
            # empty); stop instead of appending zeros forever.
            if len(raw) < 4:
                break
            offsets.append(int.from_bytes(raw, 'little'))

        lines = []
        for offset in offsets:
            if offset != 0:
                f.seek(offset)
                lines.append(readstring(f))
            else:
                lines.append('')
    return lines
def dumplines(filename, lines1, lines2):
    """Write every pair of lines to *filename* as numbered records.

    Each record has the form ``[n]``, ``UK: <line1>``, ``US: <line2>``, one
    per text line, where ``n`` counts pairs from 0. Pairing stops at the
    shorter of the two inputs.

    Args:
        filename: Path of the text file to create or overwrite.
        lines1: Lines written after the ``UK:`` label.
        lines2: Lines written after the ``US:`` label.
    """
    with open(filename, "wt") as out:
        out.writelines(
            f"[{index}]\nUK: {uk}\nUS: {us}\n"
            for index, (uk, us) in enumerate(zip(lines1, lines2))
        )
def diffscore(line1, line2):
    """Score how much two lines differ in their vocabulary.

    Both lines are lower-cased and split on non-word characters (regex
    ``\\W``). Words that occur in exactly one of the two lines are collected,
    and the score is the total length of those words. Empty tokens from the
    split add nothing because their length is 0.

    Args:
        line1: First line of text.
        line2: Second line of text.

    Returns:
        The summed length of the words unique to either line; 0 when both
        lines use the same set of words.
    """
    words1 = set(re.split(r"\W", line1.lower()))
    words2 = set(re.split(r"\W", line2.lower()))
    only_in_one = words1.symmetric_difference(words2)
    return sum(map(len, only_in_one))
def scorelines(filename, lines1, lines2):
    """Write the differing line pairs to *filename*, biggest difference first.

    Every pair is scored with diffscore(); pairs scoring 0 are dropped. The
    rest are sorted by descending score and written as records of the form
    ``[n] (diff score: s)``, ``UK: <line1>``, ``US: <line2>``, where ``n`` is
    the 0-based pair index. Pairs whose index is in IGNORED_LINES are not
    written.

    Args:
        filename: Path of the text file to create or overwrite.
        lines1: Lines written after the ``UK:`` label.
        lines2: Lines written after the ``US:`` label.
    """
    scored = []
    for lineno, (line1, line2) in enumerate(zip(lines1, lines2)):
        # Score each pair once and reuse the value for filter and record.
        score = diffscore(line1, line2)
        if score > 0:
            scored.append((score, lineno, line1, line2))

    # list.sort is stable even with reverse=True, so pairs with equal scores
    # stay in ascending line-number order.
    scored.sort(key=lambda entry: entry[0], reverse=True)

    with open(filename, "wt") as f:
        for score, lineno, line1, line2 in scored:
            if lineno not in IGNORED_LINES:
                f.write(f"[{lineno}] (diff score: {score})\nUK: {line1}\nUS: {line2}\n")
# Parse each language file once and feed the same lists to both reports;
# neither report function modifies its inputs.
uk_lines = parsedat("lang.dat.uk")
us_lines = parsedat("lang.dat.us")
dumplines("all-subtitles-uk-us.txt", uk_lines, us_lines)
scorelines("diff-subtitles-uk-us.txt", uk_lines, us_lines)