import re

# Subtitle indices that scorelines excludes from the diff report.
IGNORED_LINES = set(range(6180, 6198))


def readstring(f):
    # Read bytes up to (but not including) the next NUL terminator.
    data = b''
    while True:
        byte = f.read(1)
        if byte in (b'\0', b''):  # stop at the terminator, or at EOF to avoid looping forever
            break
        data += byte
    return data.decode('latin-1')

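# For example, given a stream positioned at the bytes b"Hello\0world",
# readstring returns "Hello" and leaves the stream just past the NUL.

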
# lang.dat is encoded in a fairly simple format: a sequence of about 6000
# 4-byte little-endian integers, which correspond to offsets of strings within
# the file. This is followed by a series of null-terminated strings (decoded
# here as Latin-1), which are pointed at by the offsets. Sometimes an offset
# is 0; this corresponds to a missing string at that index.
def parsedat(filename):
    with open(filename, "rb") as f:
        offsets = []
        # We assume the first offset in the file points at the place where
        # the index ends and the strings begin.
        while f.tell() == 0 or f.tell() < offsets[0]:
            offsets.append(int.from_bytes(f.read(4), 'little'))
        lines = []
        for offset in offsets:
            if offset != 0:
                f.seek(offset)
                lines.append(readstring(f))
            else:
                lines.append('')
        return lines
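

# For illustration only: a hypothetical inverse of parsedat, sketching how the
# layout described above could be written back out. It is not used below, and
# it assumes every string survives a Latin-1 round trip and that the first
# line is non-empty (so the first offset marks the end of the index, matching
# parsedat's assumption).
def writedat(filename, lines):
    index_size = 4 * len(lines)  # the string section starts right after the index
    offsets = []
    strings = b''
    for line in lines:
        if line:
            offsets.append(index_size + len(strings))
            strings += line.encode('latin-1') + b'\0'
        else:
            offsets.append(0)  # a missing string is stored as a zero offset
    with open(filename, "wb") as f:
        for offset in offsets:
            f.write(offset.to_bytes(4, 'little'))
        f.write(strings)
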

def dumplines(filename, lines1, lines2):
    with open(filename, "wt") as f:
        for lineno, (line1, line2) in enumerate(zip(lines1, lines2)):
            f.write(f"[{lineno}]\nUK: {line1}\nUS: {line2}\n")


def words_from_line(line):
    # Split on runs of non-word characters, dropping the empty strings that
    # re.split produces at the edges and around punctuation.
    return [word for word in re.split(r"\W+", line.lower()) if word]
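

# For example, words_from_line("It's the colour!") yields
# ["it", "s", "the", "colour"].

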
def diffscore(line1, line2):
    # Words that appear in exactly one of the two lines, weighted by length,
    # so that larger wording changes sort above tiny spelling differences.
    diffset = set(words_from_line(line1)) ^ set(words_from_line(line2))
    return sum(len(word) for word in diffset)
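

# For example, diffscore("Mind the colour!", "Mind the color!") compares
# {"mind", "the", "colour"} with {"mind", "the", "color"}: the symmetric
# difference is {"colour", "color"}, giving a score of 6 + 5 = 11.

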
def scorelines(filename, lines1, lines2):
    # Score each line pair once, keep those that differ at all, and report
    # them in descending order of difference.
    scored = []
    for lineno, (line1, line2) in enumerate(zip(lines1, lines2)):
        score = diffscore(line1, line2)
        if score > 0:
            scored.append((score, lineno, line1, line2))
    scored.sort(reverse=True, key=lambda item: item[0])
    with open(filename, "wt") as f:
        for score, lineno, line1, line2 in scored:
            if lineno not in IGNORED_LINES:
                f.write(f"[{lineno}] (diff score: {score})\nUK: {line1}\nUS: {line2}\n")


# A signed multiset of words: counts may go negative (more occurrences
# removed than added).
class WordCount:
    def __init__(self):
        self.words = {}

    def ensure(self, word):
        if word not in self.words:
            self.words[word] = 0

    def add(self, word, count=1):
        self.ensure(word)
        self.words[word] += count

    def remove(self, word):
        self.add(word, count=-1)

    def merge(self, wordcount, pred):
        # Fold another WordCount into this one, keeping only entries whose
        # count satisfies pred.
        for word, count in wordcount.words.items():
            if pred(count):
                self.add(word, count)

    def sortedTuples(self, pred, descending=False):
        # (count, word) tuples for entries satisfying pred, ordered by count.
        return sorted(((count, word) for word, count in self.words.items() if pred(count, word)), reverse=descending)
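

# For example, a WordCount records the direction of a change:
# wc = WordCount(); wc.remove("armour"); wc.add("armor")
# leaves wc.words == {"armour": -1, "armor": 1}.

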
def wordfrequencies(filename, lines1, lines2):
    removedWords = WordCount()
    addedWords = WordCount()

    for line1, line2 in zip(lines1, lines2):
        # Net signed counts for this line pair: a negative count means the
        # word occurs more often in the UK line, a positive count more often
        # in the US line; words used equally in both cancel out.
        linecount = WordCount()
        for word in words_from_line(line1):
            linecount.remove(word)
        for word in words_from_line(line2):
            linecount.add(word)
        removedWords.merge(linecount, lambda c: c < 0)
        addedWords.merge(linecount, lambda c: c > 0)

    with open(filename, "wt") as f:
        f.write("Words added to lines in US version:\n")
        for count, word in addedWords.sortedTuples(lambda c, w: len(w) > 1 and c > 0, descending=True):
            f.write(f"{word}: {count}\n")
        f.write("\nWords removed from lines in US version:\n")
        for count, word in removedWords.sortedTuples(lambda c, w: len(w) > 1 and c < 0):
            f.write(f"{word}: {count}\n")


uk_lines = parsedat("lang.dat.uk")
us_lines = parsedat("lang.dat.us")
dumplines("all-subtitles-uk-us.txt", uk_lines, us_lines)
scorelines("diff-subtitles-uk-us.txt", uk_lines, us_lines)
wordfrequencies("word-frequency-analysis.txt", uk_lines, us_lines)