import pprint
import re

# Line indices to leave out of the diff report (a known-uninteresting run).
IGNORED_LINES = set(range(6180, 6198))


def readstring(f):
    """Read a NUL-terminated string from binary file *f* at its current
    position and decode it as Latin-1.

    Stops at the NUL terminator, or at EOF so a truncated file cannot
    cause an infinite loop (f.read(1) returns b'' at EOF, never b'\\0').
    """
    buf = b''
    while True:
        byte = f.read(1)
        if byte == b'\0' or byte == b'':  # terminator or EOF
            break
        buf += byte
    return buf.decode('latin-1')


# lang.dat is encoded in a fairly simple format - a sequence of about 6000
# 4-byte little-endian integers, which correspond to offsets of strings within
# the file. This is followed by a series of null-terminated ASCII strings,
# which are pointed at by the offsets. Sometimes those offsets are 0; this
# corresponds to a missing string at that index.
def parsedat(filename):
    """Parse a lang.dat file and return a list of strings, one per index.

    Indices whose offset is 0 (missing strings) become ''.
    """
    with open(filename, "rb") as f:
        offsets = []
        # We assume the first offset in the file marks the point where the
        # index ends and the string data begins.
        while f.tell() == 0 or f.tell() < offsets[0]:
            chunk = f.read(4)
            if len(chunk) < 4:
                # Truncated or empty file: stop rather than loop forever
                # (f.tell() would never advance past the condition).
                break
            offsets.append(int.from_bytes(chunk, 'little'))
        lines = []
        for offset in offsets:
            if offset != 0:
                f.seek(offset)
                lines.append(readstring(f))
            else:
                lines.append('')
        return lines


def dumplines(filename, lines1, lines2):
    """Write every (UK, US) subtitle pair to *filename*, numbered by index."""
    with open(filename, "wt") as f:
        for lineno, (line1, line2) in enumerate(zip(lines1, lines2)):
            f.write(f"[{lineno}]\nUK: {line1}\nUS: {line2}\n")


def words_from_line(line):
    r"""Split *line* into lowercase words on non-word characters.

    NOTE: re.split(r"\W") can yield empty strings (consecutive separators,
    leading/trailing punctuation); downstream code tolerates them — they
    contribute length 0 to diff scores and are filtered out by the
    len(word) > 1 predicates in the frequency report.
    """
    return re.split(r"\W", line.lower())


def diffscore(line1, line2):
    """Score how different two lines are.

    The score is the total length of all words that appear in exactly one
    of the two lines (symmetric difference of their word sets), so longer
    differing words weigh more.
    """
    diffset = set(words_from_line(line1)) ^ set(words_from_line(line2))
    return sum(len(word) for word in diffset)


def scorelines(filename, lines1, lines2):
    """Write differing line pairs to *filename*, highest diff score first.

    Pairs with a score of 0 and indices in IGNORED_LINES are omitted.
    """
    # Compute each score once (the original comprehension evaluated
    # diffscore twice per pair: once to filter, once for the tuple).
    scored = []
    for lineno, (line1, line2) in enumerate(zip(lines1, lines2)):
        score = diffscore(line1, line2)
        if score > 0:
            scored.append((score, lineno, line1, line2))
    # Sort on score only so ties keep their original (index) order.
    scored.sort(reverse=True, key=lambda entry: entry[0])
    with open(filename, "wt") as f:
        for score, lineno, line1, line2 in scored:
            if lineno not in IGNORED_LINES:
                f.write(f"[{lineno}] (diff score: {score})\nUK: {line1}\nUS: {line2}\n")


class WordCount:
    """A signed multiset of words.

    Counts may go negative — a word removed more often than it was added
    carries a negative tally.
    """

    def __init__(self):
        self.words = {}  # word -> signed count

    def ensure(self, word):
        """Create a zero-count entry for *word* if it has none yet."""
        if word not in self.words:
            self.words[word] = 0

    def add(self, word, count=1):
        """Add *count* (may be negative) to *word*'s tally."""
        self.ensure(word)
        self.words[word] += count

    def remove(self, word):
        """Decrement *word*'s tally by one."""
        self.add(word, count=-1)

    def merge(self, wordcount, pred):
        """Fold another WordCount into this one.

        Only entries whose count satisfies *pred* are merged in.
        """
        for word, count in wordcount.words.items():
            if pred(count):
                self.add(word, count)

    def sortedTuples(self, pred, descending=False):
        """Return (count, word) tuples for entries where pred(count, word)
        holds, sorted by count (then word), optionally descending."""
        return sorted(
            ((count, word) for word, count in self.words.items() if pred(count, word)),
            reverse=descending,
        )


def wordfrequencies(filename, lines1, lines2):
    """Write a report of words added to / removed from the US version.

    For each line pair, words from the UK line count -1 and words from the
    US line count +1; per-line net negatives accumulate as removals and net
    positives as additions. Single-character "words" (including the empty
    strings words_from_line can produce) are filtered from the report.
    """
    removedWords = WordCount()
    addedWords = WordCount()
    for line1, line2 in zip(lines1, lines2):
        linecount = WordCount()
        for word in words_from_line(line1):
            linecount.remove(word)
        for word in words_from_line(line2):
            linecount.add(word)
        removedWords.merge(linecount, lambda c: c < 0)
        addedWords.merge(linecount, lambda c: c > 0)
    with open(filename, "wt") as f:
        f.write("Words added to lines in US version:\n")
        for count, word in addedWords.sortedTuples(lambda c, w: len(w) > 1 and c > 0, descending=True):
            f.write(f"{word}: {count}\n")
        f.write("\nWords removed from lines in US version:\n")
        for count, word in removedWords.sortedTuples(lambda c, w: len(w) > 1 and c < 0):
            f.write(f"{word}: {count}\n")


def main():
    """Parse both lang.dat variants and write the three comparison reports."""
    uk_lines = parsedat("lang.dat.uk")
    us_lines = parsedat('lang.dat.us')
    dumplines("all-subtitles-uk-us.txt", uk_lines, us_lines)
    scorelines("diff-subtitles-uk-us.txt", uk_lines, us_lines)
    wordfrequencies("word-frequency-analysis.txt", uk_lines, us_lines)


# Guard the driver so importing this module (e.g. for testing) does not
# require the data files to be present.
if __name__ == "__main__":
    main()