import re

# Subtitle indices that scorelines excludes from the diff report.
IGNORED_LINES = set(range(6180, 6198))


def readstring(f):
    # Read bytes up to (but not including) the next NUL terminator.
    data = b''
    while True:
        byte = f.read(1)
        if byte in (b'\0', b''):  # stop at the terminator, or at EOF to avoid looping forever
            break
        data += byte
    return data.decode('latin-1')

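# For example, given a stream positioned at the bytes b"Hello\0world",
# readstring returns "Hello" and leaves the stream just past the NUL.

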
# lang.dat is encoded in a fairly simple format: a sequence of about 6000
# 4-byte little-endian integers, which correspond to offsets of strings within
# the file. This is followed by a series of null-terminated strings (decoded
# here as Latin-1), which are pointed at by the offsets. Sometimes an offset
# is 0; this corresponds to a missing string at that index.
def parsedat(filename):
    with open(filename, "rb") as f:
        offsets = []
        # We assume the first offset in the file points at the place where
        # the index ends and the strings begin.
        while f.tell() == 0 or f.tell() < offsets[0]:
            offsets.append(int.from_bytes(f.read(4), 'little'))
        lines = []
        for offset in offsets:
            if offset != 0:
                f.seek(offset)
                lines.append(readstring(f))
            else:
                lines.append('')
        return lines
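

# For illustration only: a hypothetical inverse of parsedat, sketching how the
# layout described above could be written back out. It is not used below, and
# it assumes every string survives a Latin-1 round trip and that the first
# line is non-empty (so the first offset marks the end of the index, matching
# parsedat's assumption).
def writedat(filename, lines):
    index_size = 4 * len(lines)  # the string section starts right after the index
    offsets = []
    strings = b''
    for line in lines:
        if line:
            offsets.append(index_size + len(strings))
            strings += line.encode('latin-1') + b'\0'
        else:
            offsets.append(0)  # a missing string is stored as a zero offset
    with open(filename, "wb") as f:
        for offset in offsets:
            f.write(offset.to_bytes(4, 'little'))
        f.write(strings)
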

def dumplines(filename, lines1, lines2):
    with open(filename, "wt") as f:
        for lineno, (line1, line2) in enumerate(zip(lines1, lines2)):
            f.write(f"[{lineno}]\nUK: {line1}\nUS: {line2}\n")


def words_from_line(line):
    # Split on runs of non-word characters, dropping the empty strings that
    # re.split produces at the edges and around punctuation.
    return [word for word in re.split(r"\W+", line.lower()) if word]
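

# For example, words_from_line("It's the colour!") yields
# ["it", "s", "the", "colour"].

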
def diffscore(line1, line2):
    # Words that appear in exactly one of the two lines, weighted by length,
    # so that larger wording changes sort above tiny spelling differences.
    diffset = set(words_from_line(line1)) ^ set(words_from_line(line2))
    return sum(len(word) for word in diffset)
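

# For example, diffscore("Mind the colour!", "Mind the color!") compares
# {"mind", "the", "colour"} with {"mind", "the", "color"}: the symmetric
# difference is {"colour", "color"}, giving a score of 6 + 5 = 11.

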
def scorelines(filename, lines1, lines2):
    # Score each line pair once, keep those that differ at all, and report
    # them in descending order of difference.
    scored = []
    for lineno, (line1, line2) in enumerate(zip(lines1, lines2)):
        score = diffscore(line1, line2)
        if score > 0:
            scored.append((score, lineno, line1, line2))
    scored.sort(reverse=True, key=lambda item: item[0])
    with open(filename, "wt") as f:
        for score, lineno, line1, line2 in scored:
            if lineno not in IGNORED_LINES:
                f.write(f"[{lineno}] (diff score: {score})\nUK: {line1}\nUS: {line2}\n")


# A signed multiset of words: counts may go negative (more occurrences
# removed than added).
class WordCount:
    def __init__(self):
        self.words = {}

    def ensure(self, word):
        if word not in self.words:
            self.words[word] = 0

    def add(self, word, count=1):
        self.ensure(word)
        self.words[word] += count

    def remove(self, word):
        self.add(word, count=-1)

    def merge(self, wordcount, pred):
        # Fold another WordCount into this one, keeping only entries whose
        # count satisfies pred.
        for word, count in wordcount.words.items():
            if pred(count):
                self.add(word, count)

    def sortedTuples(self, pred, descending=False):
        # (count, word) tuples for entries satisfying pred, ordered by count.
        return sorted(((count, word) for word, count in self.words.items() if pred(count, word)), reverse=descending)
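

# For example, a WordCount records the direction of a change:
# wc = WordCount(); wc.remove("armour"); wc.add("armor")
# leaves wc.words == {"armour": -1, "armor": 1}.

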
def wordfrequencies(filename, lines1, lines2):
    removedWords = WordCount()
    addedWords = WordCount()

    for line1, line2 in zip(lines1, lines2):
        # Net signed counts for this line pair: a negative count means the
        # word occurs more often in the UK line, a positive count more often
        # in the US line; words used equally in both cancel out.
        linecount = WordCount()
        for word in words_from_line(line1):
            linecount.remove(word)
        for word in words_from_line(line2):
            linecount.add(word)
        removedWords.merge(linecount, lambda c: c < 0)
        addedWords.merge(linecount, lambda c: c > 0)

    with open(filename, "wt") as f:
        f.write("Words added to lines in US version:\n")
        for count, word in addedWords.sortedTuples(lambda c, w: len(w) > 1 and c > 0, descending=True):
            f.write(f"{word}: {count}\n")
        f.write("\nWords removed from lines in US version:\n")
        for count, word in removedWords.sortedTuples(lambda c, w: len(w) > 1 and c < 0):
            f.write(f"{word}: {count}\n")


uk_lines = parsedat("lang.dat.uk")
us_lines = parsedat("lang.dat.us")
dumplines("all-subtitles-uk-us.txt", uk_lines, us_lines)
scorelines("diff-subtitles-uk-us.txt", uk_lines, us_lines)
wordfrequencies("word-frequency-analysis.txt", uk_lines, us_lines)