Initial Commit

master
Travis Burtrum 2016-04-20 02:08:54 -04:00
commit db5588f633
4 changed files with 2253 additions and 0 deletions

2048
english.txt Normal file

File diff suppressed because it is too large Load Diff

192
freespeech.py Executable file
View File

@ -0,0 +1,192 @@
#!/usr/bin/env python
"""
get wordlists from:
https://github.com/bitcoin/bips/blob/master/bip-0039/bip-0039-wordlists.md
http://wordnetcode.princeton.edu/3.0/WNdb-3.0.tar.gz
extract useable nouns from wordnetcode:
$ cut -d' ' -f 5 data.noun | grep -v _ | grep -v '/' | tr '[A-Z]' '[a-z]' | sort | uniq | wc -l
40980
"""
import sys, argparse
import math
from bitstring import ReadError, BitStream, pack
from collections import deque
class FreeSpeech(object):
    """Encode arbitrary bytes as words from a word list, and decode them back.

    Every word stands for its position in the de-duplicated word list.
    ``encode`` slices the input into ``num_bits``-wide unsigned integers and
    writes the word for each one; ``decode`` maps words back to integers and
    re-packs the bits. The same word list must be used in both directions.

    Words, separators and stream contents are handled as byte strings, and
    no Python 2-only syntax is used, so the class is not tied to one major
    Python version.
    """

    # Bytes removed from every token before it is treated as a word.
    _STRIP_CHARS = b'`~!@#$%^&*()-_=+[{]}\\|\'";:/?.>,<\t\n\v\f\r'

    def __init__(self, dict_filename, max_words_per_line=10, byte_buffer=65536):
        """Load the word list and derive the encoding parameters.

        dict_filename -- path of the word list file.
        max_words_per_line -- number of words ``encode`` writes per line.
        byte_buffer -- chunk size in bytes used when reading input and when
            flushing decoded output.

        Raises ValueError if the list holds fewer than two distinct words,
        because at least one bit per word is needed to carry any data.
        """
        self.verbose = True
        (self.word_list, self.word_dict) = self.read_dict(dict_filename)
        self.num_bits = self.min_even_bits(len(self.word_dict))
        if self.num_bits < 1:
            raise ValueError('word list must contain at least 2 distinct words')
        # bitstring format string for one full-width word index.
        self.int_type = 'uint:' + str(self.num_bits)
        self.max_words_on_line = max_words_per_line
        self.byte_buffer = byte_buffer
        self.bit_buffer = self.byte_buffer * 8
        # Words written on the current output line so far.
        self.word_count = 0

    def max_int(self, min_bits):
        """Return 2 ** min_bits as a float."""
        return math.pow(2, min_bits)

    def min_bits(self, max_int):
        """Return the base-2 logarithm of max_int as a float."""
        return math.log(max_int) / math.log(2)

    def min_even_bits(self, max_int):
        """Return the largest whole number of bits n with 2 ** n <= max_int.

        When max_int is not an exact power of two and ``self.verbose`` is
        set, a warning is written to stderr: the words beyond index
        2 ** n - 1 can never be produced by ``encode``.

        Raises ValueError if max_int is smaller than 1.
        """
        whole = int(max_int)
        if whole < 1:
            raise ValueError('max_int must be at least 1')
        # Integer arithmetic instead of a float log ratio, which is not
        # exact for every power of two.
        floor = whole.bit_length() - 1
        is_power_of_two = whole == max_int and whole & (whole - 1) == 0
        if self.verbose and not is_power_of_two:
            sys.stderr.write('There is no exact integer for min_bits, not all words will be used!\n')
        return floor

    def bits_to_unsigned_int(self, bits):
        """Return the integer spelled by an iterable of '0'/'1' characters.

        The first character is the least significant bit.
        """
        num = 0
        for index, bit in enumerate(bits):
            if bit == '1':
                num += 1 << index
        return num

    def file_to_in_stream(self, in_filename):
        """Return a binary input stream; '-' selects standard input."""
        if in_filename == '-':
            # Python 3 exposes the byte-level stream as .buffer; Python 2's
            # sys.stdin already yields byte strings.
            return getattr(sys.stdin, 'buffer', sys.stdin)
        return open(in_filename, 'rb')

    def file_to_out_stream(self, out_filename):
        """Return a binary output stream; '-' selects standard output."""
        if out_filename == '-':
            return getattr(sys.stdout, 'buffer', sys.stdout)
        return open(out_filename, 'wb')

    def _transcode(self, action, in_filename, out_filename):
        """Open both streams, run action(in_stream, out_stream), clean up.

        Only files opened here are closed; the standard streams are flushed
        but left open so the process can keep using them afterwards.
        """
        in_stream = self.file_to_in_stream(in_filename)
        try:
            out_stream = self.file_to_out_stream(out_filename)
            try:
                action(in_stream, out_stream)
                out_stream.flush()
            finally:
                if out_filename != '-':
                    out_stream.close()
        finally:
            if in_filename != '-':
                in_stream.close()

    def decodeFiles(self, in_filename, out_filename):
        """Decode the words in in_filename into out_filename ('-' = stdio)."""
        self._transcode(self.decode, in_filename, out_filename)

    def encodeFiles(self, in_filename, out_filename):
        """Encode the bytes of in_filename as words in out_filename ('-' = stdio)."""
        self._transcode(self.encode, in_filename, out_filename)

    def decode(self, in_stream, out_stream):
        """Read words from in_stream and write the bytes they encode.

        Tokens that are not in the word list are skipped. The last two known
        words are held back while streaming: the final word is not data, its
        index is the bit width of the word before it. Input with fewer than
        two known words (which is what ``encode`` emits for empty input)
        produces no output.
        """
        bs = BitStream()
        pending = deque()
        for word in self.words_from_file(in_stream):
            if word not in self.word_dict:
                continue
            pending.append(self.word_dict[word])
            if len(pending) <= 2:
                continue
            # The oldest held-back index is now known to be a full-width word.
            bs.append(pack(self.int_type, pending.popleft()))
            if bs.len > self.bit_buffer:
                flushed = 0
                for piece in bs.cut(self.bit_buffer):
                    flushed += 1
                    piece.tofile(out_stream)
                del bs[:flushed * self.bit_buffer]
        if len(pending) < 2:
            # No data word precedes the trailer, so there is nothing to
            # write; bs is still empty because it is only filled once three
            # indices have been seen.
            return
        extra_bits = pending.pop()
        bs.append(pack('uint:' + str(extra_bits), pending.popleft()))
        bs.tofile(out_stream)

    def print_index(self, index, out_stream):
        """Write the word for index, followed by a space or a line break.

        A newline replaces the space after every ``max_words_on_line``-th
        word on a line.
        """
        out_stream.write(self.word_list[index])
        self.word_count += 1
        if self.word_count >= self.max_words_on_line:
            out_stream.write(b'\n')
            self.word_count = 0
        else:
            out_stream.write(b' ')

    def _emit_full_words(self, bs, out_stream):
        """Write one word for every complete num_bits group still unread in bs."""
        try:
            while True:
                self.print_index(bs.read(self.int_type), out_stream)
        except ReadError:
            # Fewer than num_bits bits are left; they wait for more input.
            pass

    def encode(self, in_stream, out_stream):
        """Read bytes from in_stream and write them to out_stream as words.

        The input is consumed in ``byte_buffer``-sized chunks. After the
        last chunk, any leftover bits (fewer than ``num_bits``) are written
        as one short word. The output always ends with a trailer word whose
        index is the bit width of the word before it: the leftover count, or
        ``num_bits`` when the data filled the last word exactly.
        """
        bs = BitStream()
        while True:
            chunk = in_stream.read(self.byte_buffer)
            if not chunk:
                break
            bs.append(BitStream(bytes=chunk))
            self._emit_full_words(bs, out_stream)
        extra_bits = bs.len - bs.bitpos
        if extra_bits > 0:
            self.print_index(bs.read('uint:' + str(extra_bits)), out_stream)
        else:
            extra_bits = self.num_bits
        # Trailer: tells decode how wide the previous word is.
        self.print_index(extra_bits, out_stream)

    def words_from_file(self, in_file):
        """Yield the non-empty words found in an iterable of lines.

        Lines are split on single spaces; the punctuation and control bytes
        in ``_STRIP_CHARS`` are deleted from each token. Text lines are
        encoded as UTF-8 first so that every word is a byte string.
        """
        for line in in_file:
            if not isinstance(line, bytes):
                line = line.encode('utf-8')
            for token in line.split(b' '):
                word = token.translate(None, self._STRIP_CHARS).strip()
                if word:
                    yield word

    def remove_duplicates(self, values):
        """Return the items of values as a list, first occurrences only, in order."""
        output = []
        seen = set()
        for value in values:
            if value not in seen:
                seen.add(value)
                output.append(value)
        return output

    def read_dict(self, filename):
        """Load a word list file.

        Returns ``(words, index_of)``: the de-duplicated words in file order
        and a dict mapping each word to its position in that list.
        """
        with open(filename, 'rb') as dict_file:
            # The generator must be drained while the file is still open.
            words = self.remove_duplicates(self.words_from_file(dict_file))
        index_of = {word: index for index, word in enumerate(words)}
        return (words, index_of)
def main(argv=None):
    """Command-line entry point: encode or decode according to the options.

    argv -- full argument vector including the program name (as in
        ``sys.argv``); when None, argparse reads ``sys.argv`` itself.

    Returns 0 on success. On failure the error is reported on stderr and 1
    is returned. Invalid options make argparse exit via SystemExit.
    """
    parser = argparse.ArgumentParser(description='FreeSpeech encode or decode IN_FILE, or standard input, to OUT_FILE or standard output.')
    parser.add_argument('-d', '--decode', dest='decode', action='store_true', help='decode data (default: encode data)')
    parser.add_argument('-i', '--in', dest='in_file', default='-', help='input file (default: - (stdin))')
    parser.add_argument('-o', '--out', dest='out_file', default='-', help='output file (default: - (stdout))')
    parser.add_argument('-m', '--max-words-per-line', dest='max_words_per_line', type=int, default=10, help='maximum words to put on one line (default: 10)')
    parser.add_argument('-b', '--byte-buffer', dest='byte_buffer', type=int, default=65536, help='size of byte buffer used when reading/writing files (default: 65536 (64 KiB))')
    parser.add_argument('word_list', nargs=1, help='word list file to use, must use the same one for encoding/decoding')
    # argv[0] is the program name, which argparse does not expect to see.
    args = parser.parse_args(None if argv is None else argv[1:])
    try:
        fs = FreeSpeech(args.word_list[0], args.max_words_per_line, args.byte_buffer)
        if args.decode:
            fs.decodeFiles(args.in_file, args.out_file)
        else:
            fs.encodeFiles(args.in_file, args.out_file)
        return 0
    except Exception as error:
        # Keep the non-zero exit status, but say what went wrong instead of
        # failing silently. KeyboardInterrupt/SystemExit are not swallowed.
        sys.stderr.write('freespeech: error: %s\n' % (error,))
        return 1


if __name__ == '__main__':
    sys.exit(main(sys.argv))

8
readme.md Normal file
View File

@ -0,0 +1,8 @@
FreeSpeech
----------
The idea here is to encode arbitrary data as human words taken from an arbitrary wordlist of arbitrary length.
Then, using the same wordlist, the words can be decoded back into the original data.
I will expand on the concept and techniques used later; for now, a working reference implementation can be found in
freespeech.py. A sample wordlist can be found in english.txt.

5
test.sh Executable file
View File

@ -0,0 +1,5 @@
#!/bin/bash
# Round-trip smoke test for freespeech.py.
# Usage: ./test.sh COUNT BS
#   COUNT and BS are handed to dd as its count= and bs= operands, so the test
#   data is COUNT blocks of BS bytes read from /dev/urandom.
count=$1
bs=$2
# tee duplicates the random data into two process substitutions: one hashes it
# directly, the other hashes it after an encode/decode pass through
# freespeech.py with english.txt. Both md5sum results are sent to stderr; the
# round trip is intact when the two digests are identical.
# NOTE(review): the trailing "| cat" presumably keeps the script from exiting
# before the process substitutions finish - confirm before removing it.
dd if=/dev/urandom count=$count bs=$bs 2>/dev/null | tee >(md5sum 1>&2) >(./freespeech.py english.txt | ./freespeech.py -d english.txt | md5sum 1>&2) 1>/dev/null 2>&1 | cat
# Same pipeline with base64 standing in for freespeech.py (disabled).
#dd if=/dev/urandom count=$count bs=$bs 2>/dev/null | tee >(md5sum 1>&2) >(base64 | base64 -di | md5sum 1>&2) 1>/dev/null 2>&1 | cat