Initial Commit

2024-12-21 15:08:53 -05:00 · 2016-04-20 02:08:54 -04:00 · 2016-04-20 02:08:54 -04:00 · db5588f633
commit db5588f633
4 changed files with 2253 additions and 0 deletions
--- a/english.txt
+++ b/english.txt
--- a/freespeech.py
+++ b/freespeech.py
@ -0,0 +1,192 @@
+#!/usr/bin/env python
+
+"""
+
+get wordlists from:
+
+https://github.com/bitcoin/bips/blob/master/bip-0039/bip-0039-wordlists.md
+http://wordnetcode.princeton.edu/3.0/WNdb-3.0.tar.gz
+
+extract useable nouns from wordnetcode:
+$ cut -d' ' -f 5 data.noun | grep -v _ | grep -v '/' | tr '[A-Z]' '[a-z]' | sort | uniq | wc -l
+40980
+
+"""
+
+import sys, argparse
+import math
+from bitstring import ReadError, BitStream, pack
+from collections import deque
+
+class FreeSpeech:
+    def __init__(self, dict_filename, max_words_per_line=10, byte_buffer=65536):
+        self.verbose = True
+        (self.word_list, self.word_dict) = self.read_dict(dict_filename)
+        #print 'word_list:', self.word_list
+        #print 'word_dict:', self.word_dict
+        self.num_bits = self.min_even_bits(len(self.word_dict))
+        #print 'num_bits:', self.num_bits
+        self.int_type = 'uint:' + str(self.num_bits)
+        #print >> sys.stderr, 'int_type:', self.int_type
+        self.max_words_on_line = max_words_per_line
+        self.byte_buffer = byte_buffer
+        self.bit_buffer = self.byte_buffer * 8
+        self.word_count = 0
+
+    def max_int(self, min_bits):
+        return math.pow(2, min_bits)
+
+    def min_bits(self, max_int):
+        return math.log(max_int) / math.log(2)
+
+    def min_even_bits(self, max_int):
+        exact = self.min_bits(max_int)
+        #print >> sys.stderr, 'exact:', exact
+        floor = int(math.floor(exact))
+        if self.verbose and exact != floor:
+            print >> sys.stderr, 'There is no exact integer for min_bits, not all words will be used!'
+        return floor
+
+    def bits_to_unsigned_int(self, bits):
+        num = 0
+        index = 0
+        for bit in bits:
+            if bit == '1':
+                num += math.pow(2, index)
+            index += 1
+        return num
+
+    def file_to_in_stream(self, in_filename):
+        return sys.stdin if in_filename == '-' else open(in_filename, 'rb')
+
+    def file_to_out_stream(self, out_filename):
+        return sys.stdout if out_filename == '-' else open(out_filename, 'wb')
+
+    def decodeFiles(self, in_filename, out_filename):
+        with self.file_to_in_stream(in_filename) as in_stream, self.file_to_out_stream(out_filename) as out_stream:
+            self.decode(in_stream, out_stream)
+
+    def encodeFiles(self, in_filename, out_filename):
+        with self.file_to_in_stream(in_filename) as in_stream, self.file_to_out_stream(out_filename) as out_stream:
+            self.encode(in_stream, out_stream)
+
+    def decode(self, in_stream, out_stream):
+        bs = BitStream()
+        dq = deque()
+        at_least_three = False
+        for word in self.words_from_file(in_stream):
+            if not word or word not in self.word_dict:
+                continue
+            #print >> sys.stderr, 'word:"', word, '"'
+            dq.append(self.word_dict[word])
+            if at_least_three or len(dq) == 3:
+                bs.append(pack(self.int_type, dq.popleft()))
+                at_least_three = True
+                if bs.len > self.bit_buffer:
+                    cut = 0
+                    for byte in bs.cut(self.bit_buffer):
+                        cut += 1
+                        byte.tofile(out_stream)
+                    del bs[:cut * self.bit_buffer]
+
+        # dq has to have exactly 2 elements here, the last is the bit length of the first, unless it's 0
+        #print >> sys.stderr, 'dq:', dq
+        extra_bits = dq.pop()
+        bs.append(pack('uint:' + str(extra_bits), dq.popleft()))
+
+        bs.tofile(out_stream)
+
+    def print_index(self, index, out_stream):
+        #print self.word_list[index],
+        out_stream.write(self.word_list[index])
+        self.word_count += 1
+        if self.word_count > self.max_words_on_line:
+            out_stream.write('\n')
+            self.word_count = 0
+        else:
+            out_stream.write(' ')
+
+    def encode(self, in_stream, out_stream):
+        extra_bits = self.num_bits
+        bs = BitStream()
+        try:
+            while True:
+                chunk = in_stream.read(self.byte_buffer)
+                #print >> sys.stderr, 'chunk:', chunk
+                if(chunk):
+                    bs.append(BitStream(bytes=chunk))
+                else:
+                    while True:
+                        self.print_index(bs.read(self.int_type), out_stream)
+                try:
+                    while True:
+                        self.print_index(bs.read(self.int_type), out_stream)
+                except ReadError, e:
+                    #print >> sys.stderr, 'inner:', e
+                    pass
+        except ReadError, e:
+            #print >> sys.stderr, 'outer:', e
+            extra_bits = bs.len - bs.bitpos
+            if extra_bits > 0:
+                #print >> sys.stderr, 'extra_bits:', extra_bits
+                self.print_index(bs.read('uint:' + str(extra_bits)), out_stream)
+            else:
+                extra_bits = self.num_bits
+        # write extra_bits
+        self.print_index(extra_bits, out_stream)
+
+    def words_from_file(self, in_file):
+        for line in in_file:
+            #print 'line:', line
+            words = line.split(' ')
+            for word in words:
+                word = word.translate(None, '`~!@#$%^&*()-_=+[{]}\|\'";:/?.>,<\t\n\v\f\r').strip()
+                if word:
+                    yield word
+
+    def remove_duplicates(self, values):
+        output = []
+        seen = set()
+        for value in values:
+            if value not in seen:
+                output.append(value)
+                seen.add(value)
+        return output
+
+    def read_dict(self, filename):
+        ret = []
+        with open(filename, 'r') as dict_file:
+            for word in self.words_from_file(dict_file):
+                ret.append(word.strip())
+        ret = self.remove_duplicates(ret)
+        index = 0
+        ret_dict = {}
+        for word in ret:
+            ret_dict[word] = index
+            index += 1
+        return (ret, ret_dict)
+
+def main(argv=None):
+    parser = argparse.ArgumentParser(description='FreeSpeech encode or decode IN_FILE, or standard input, to OUT_FILE or standard output.')
+    parser.add_argument('-d', '--decode', dest='decode', action='store_true', help='decode data (default: encode data)')
+    parser.add_argument('-i', '--in', dest='in_file', default='-', help='input file (default: - (stdin))')
+    parser.add_argument('-o', '--out', dest='out_file', default='-', help='output file (default: - (stdout))')
+    parser.add_argument('-m', '--max-words-per-line', dest='max_words_per_line', type=int, default=10, help='maximum words to put on one line (default: 10)')
+    parser.add_argument('-b', '--byte-buffer', dest='byte_buffer', type=int, default=65536, help='size of byte buffer used when reading/writing files (default: 65536 (64MB))')
+    parser.add_argument('word_list', nargs=1, help='word list file to use, must use the same one for encoding/decoding')
+
+    args = parser.parse_args()
+    #print args
+
+    try:
+        fs = FreeSpeech(args.word_list[0], args.max_words_per_line, args.byte_buffer)
+        if args.decode:
+            fs.decodeFiles(args.in_file, args.out_file)
+        else:
+            fs.encodeFiles(args.in_file, args.out_file)
+        return 0
+    except:
+        return 1
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))
--- a/readme.md
+++ b/readme.md
@ -0,0 +1,8 @@
+FreeSpeech
+----------
+
+The idea here is to encode arbitrary data as human words taken from an arbitrary wordlist of arbitrary length.
+Then, using the same wordlist, be able to decode the words back to the same data as before.
+
+I will expand on the concept and techniques used later, for now, a working reference implementation can be found in
+freespeech.py.  A sample wordlist can be found in english.txt.
--- a/test.sh
+++ b/test.sh
@ -0,0 +1,5 @@
+#!/bin/bash
+count=$1
+bs=$2
+dd if=/dev/urandom count=$count bs=$bs 2>/dev/null | tee >(md5sum 1>&2) >(./freespeech.py english.txt | ./freespeech.py -d english.txt | md5sum 1>&2) 1>/dev/null 2>&1 | cat
+#dd if=/dev/urandom count=$count bs=$bs 2>/dev/null | tee >(md5sum 1>&2) >(base64 | base64 -di | md5sum 1>&2) 1>/dev/null 2>&1 | cat