mirror of
https://github.com/moparisthebest/freespeech
synced 2024-11-21 16:35:02 -05:00
Initial Commit
This commit is contained in:
commit
db5588f633
2048
english.txt
Normal file
2048
english.txt
Normal file
File diff suppressed because it is too large
Load Diff
192
freespeech.py
Executable file
192
freespeech.py
Executable file
@ -0,0 +1,192 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
get wordlists from:
|
||||||
|
|
||||||
|
https://github.com/bitcoin/bips/blob/master/bip-0039/bip-0039-wordlists.md
|
||||||
|
http://wordnetcode.princeton.edu/3.0/WNdb-3.0.tar.gz
|
||||||
|
|
||||||
|
extract useable nouns from wordnetcode:
|
||||||
|
$ cut -d' ' -f 5 data.noun | grep -v _ | grep -v '/' | tr '[A-Z]' '[a-z]' | sort | uniq | wc -l
|
||||||
|
40980
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys, argparse
|
||||||
|
import math
|
||||||
|
from bitstring import ReadError, BitStream, pack
|
||||||
|
from collections import deque
|
||||||
|
|
||||||
|
class FreeSpeech(object):
    """Encode arbitrary bytes as dictionary words and decode them back.

    Every word stands for its position in the word list, written as a
    fixed-width unsigned integer of ``num_bits`` bits.  An encoded stream is:

    * one word per full ``num_bits`` group of input bits,
    * one word for the leftover bits, if there are any,
    * one final word whose index is the bit width of the word before it.

    The syntax used here parses on both Python 2 and Python 3.
    NOTE(review): the Python 3 stream handling (binary stdin/stdout, bytes
    lines) has not been exercised against bitstring here -- confirm.
    """

    # Characters deleted from every token before lookup: ASCII punctuation
    # plus every whitespace character other than the space used for splitting.
    _STRIP_CHARS = frozenset('`~!@#$%^&*()-_=+[{]}\\|\'";:/?.>,<\t\n\v\f\r')

    def __init__(self, dict_filename, max_words_per_line=10, byte_buffer=65536):
        """Load the word list and derive the per-word bit width.

        dict_filename      -- path of the word list file
        max_words_per_line -- words written per output line when encoding
        byte_buffer        -- read size (bytes) when encoding and flush
                              threshold when decoding

        Raises ValueError when the list has fewer than two distinct words,
        because a word could then carry no data bits at all.
        """
        self.verbose = True
        (self.word_list, self.word_dict) = self.read_dict(dict_filename)
        self.num_bits = self.min_even_bits(len(self.word_dict))
        if self.num_bits < 1:
            raise ValueError('word list must contain at least 2 distinct words')
        self.int_type = 'uint:' + str(self.num_bits)
        self.max_words_on_line = max_words_per_line
        self.byte_buffer = byte_buffer
        self.bit_buffer = self.byte_buffer * 8
        self.word_count = 0

    def max_int(self, min_bits):
        """Return 2 ** min_bits (as a float, as math.pow does)."""
        return math.pow(2, min_bits)

    def min_bits(self, max_int):
        """Return the base-2 logarithm of max_int as a float."""
        return math.log(max_int) / math.log(2)

    def min_even_bits(self, max_int):
        """Return the largest whole number of bits b with 2 ** b <= max_int.

        Integer arithmetic is used so float rounding can neither misjudge an
        exact power of two nor trigger the warning spuriously.  When max_int
        is not a power of two and ``self.verbose`` is set, a warning is
        written to stderr because the surplus words can never be emitted.

        Raises ValueError when max_int is smaller than 1.
        """
        count = int(max_int)
        if count < 1:
            raise ValueError('cannot derive a bit width from %d words' % count)
        whole_bits = count.bit_length() - 1
        if self.verbose and count != (1 << whole_bits):
            sys.stderr.write('There is no exact integer for min_bits, not all words will be used!\n')
        return whole_bits

    def bits_to_unsigned_int(self, bits):
        """Return the integer spelled by an iterable of '0'/'1' characters.

        The first character is the least significant bit.  The result is
        always an int, so long bit strings lose no precision.
        """
        num = 0
        for index, bit in enumerate(bits):
            if bit == '1':
                num += 1 << index
        return num

    def file_to_in_stream(self, in_filename):
        """Return a binary input stream; '-' selects standard input."""
        if in_filename == '-':
            # Python 3 exposes the byte-oriented stdin as ``.buffer``.
            return getattr(sys.stdin, 'buffer', sys.stdin)
        return open(in_filename, 'rb')

    def file_to_out_stream(self, out_filename):
        """Return a binary output stream; '-' selects standard output."""
        if out_filename == '-':
            # Python 3 exposes the byte-oriented stdout as ``.buffer``.
            return getattr(sys.stdout, 'buffer', sys.stdout)
        return open(out_filename, 'wb')

    def decodeFiles(self, in_filename, out_filename):
        """Decode the words in in_filename into bytes in out_filename."""
        with self.file_to_in_stream(in_filename) as in_stream, self.file_to_out_stream(out_filename) as out_stream:
            self.decode(in_stream, out_stream)

    def encodeFiles(self, in_filename, out_filename):
        """Encode the bytes of in_filename as words in out_filename."""
        with self.file_to_in_stream(in_filename) as in_stream, self.file_to_out_stream(out_filename) as out_stream:
            self.encode(in_stream, out_stream)

    def decode(self, in_stream, out_stream):
        """Turn a stream of words back into the original bytes.

        Tokens that are not in the dictionary are skipped.  Two word indexes
        are always held back, because the last word of a stream is the bit
        width of the word before it rather than data.  A stream with fewer
        than two dictionary words (such as the encoding of empty input)
        decodes to empty output instead of failing.
        """
        bs = BitStream()
        pending = deque()
        for word in self.words_from_file(in_stream):
            index = self.word_dict.get(word)
            if index is None:
                continue
            pending.append(index)
            if len(pending) > 2:
                bs.append(pack(self.int_type, pending.popleft()))
            # Flush whole buffers only, sliced explicitly so a partial chunk
            # can never be written (and zero-padded) in mid-stream.
            # NOTE(review): whether BitStream.cut() yields a trailing short
            # chunk appears to differ between bitstring releases -- confirm.
            while bs.len > self.bit_buffer:
                bs[:self.bit_buffer].tofile(out_stream)
                del bs[:self.bit_buffer]

        if len(pending) == 2:
            # Last word: bit width of the word before it.
            extra_bits = pending.pop()
            bs.append(pack('uint:' + str(extra_bits), pending.popleft()))

        if bs.len:
            bs.tofile(out_stream)

    @staticmethod
    def _write_text(out_stream, text):
        """Write text to out_stream, encoding it if the stream wants bytes."""
        try:
            out_stream.write(text)
        except TypeError:
            # Python 3 binary streams reject str without writing anything.
            out_stream.write(text.encode('utf-8'))

    def print_index(self, index, out_stream):
        """Write the word for index, followed by a space or a line break.

        A line break is written after every ``max_words_on_line`` words.
        """
        word = self.word_list[index]
        self.word_count += 1
        if self.word_count >= self.max_words_on_line:
            separator = '\n'
            self.word_count = 0
        else:
            separator = ' '
        self._write_text(out_stream, word + separator)

    def _emit_full_words(self, bs, out_stream):
        """Write one word per complete num_bits group still unread in bs."""
        try:
            while True:
                self.print_index(bs.read(self.int_type), out_stream)
        except ReadError:
            # Fewer than num_bits bits remain; the caller deals with them.
            pass

    def encode(self, in_stream, out_stream):
        """Write the bytes of in_stream to out_stream as words."""
        bs = BitStream()
        while True:
            chunk = in_stream.read(self.byte_buffer)
            if not chunk:
                break
            bs.append(BitStream(bytes=chunk))
            self._emit_full_words(bs, out_stream)
        self._emit_full_words(bs, out_stream)

        # NOTE(review): assumes a failed read leaves bs.bitpos untouched, as
        # the original implementation did -- confirm against bitstring.
        extra_bits = bs.len - bs.bitpos
        if extra_bits > 0:
            self.print_index(bs.read('uint:' + str(extra_bits)), out_stream)
        else:
            # The previous word was a full one.
            extra_bits = self.num_bits
        # Trailer: how many bits the preceding word carries.
        self.print_index(extra_bits, out_stream)

    def words_from_file(self, in_file):
        """Yield the cleaned, non-empty words of an iterable of lines.

        Lines are split on single spaces only; punctuation and all other
        whitespace are then deleted from each token, so tab-separated words
        merge into one word.
        """
        strip_chars = self._STRIP_CHARS
        for line in in_file:
            if isinstance(line, bytes) and not isinstance(line, str):
                # Python 3 binary streams yield bytes; dictionary keys are str.
                line = line.decode('utf-8', 'replace')
            for token in line.split(' '):
                word = ''.join(ch for ch in token if ch not in strip_chars).strip()
                if word:
                    yield word

    def remove_duplicates(self, values):
        """Return values as a list without repeats, keeping first-seen order."""
        output = []
        seen = set()
        for value in values:
            if value not in seen:
                seen.add(value)
                output.append(value)
        return output

    def read_dict(self, filename):
        """Read a word list file.

        Returns ``(words, index_of)``: the distinct words in file order and a
        dict mapping each word to its position in that list.
        """
        with open(filename, 'r') as dict_file:
            words = self.remove_duplicates(self.words_from_file(dict_file))
        index_of = dict((word, position) for position, word in enumerate(words))
        return (words, index_of)
|
||||||
|
|
||||||
|
def main(argv=None):
    """Command-line entry point: encode or decode according to the flags.

    argv -- full argument vector including the program name (as in
            ``sys.argv``); when None, argparse reads ``sys.argv`` itself.

    Returns 0 on success and 1 when encoding/decoding fails, after writing
    the reason to stderr.  Argument errors exit through argparse as usual.
    """
    parser = argparse.ArgumentParser(description='FreeSpeech encode or decode IN_FILE, or standard input, to OUT_FILE or standard output.')
    parser.add_argument('-d', '--decode', dest='decode', action='store_true', help='decode data (default: encode data)')
    parser.add_argument('-i', '--in', dest='in_file', default='-', help='input file (default: - (stdin))')
    parser.add_argument('-o', '--out', dest='out_file', default='-', help='output file (default: - (stdout))')
    parser.add_argument('-m', '--max-words-per-line', dest='max_words_per_line', type=int, default=10, help='maximum words to put on one line (default: 10)')
    parser.add_argument('-b', '--byte-buffer', dest='byte_buffer', type=int, default=65536, help='size of byte buffer used when reading/writing files (default: 65536 (64KB))')
    parser.add_argument('word_list', nargs=1, help='word list file to use, must use the same one for encoding/decoding')

    # Honour the caller's argument vector; argv[0] is the program name.
    args = parser.parse_args(None if argv is None else argv[1:])

    try:
        fs = FreeSpeech(args.word_list[0], args.max_words_per_line, args.byte_buffer)
        if args.decode:
            fs.decodeFiles(args.in_file, args.out_file)
        else:
            fs.encodeFiles(args.in_file, args.out_file)
        return 0
    except Exception as exc:
        # Top-level boundary: report the failure instead of hiding it, and
        # let KeyboardInterrupt/SystemExit propagate untouched.
        sys.stderr.write('freespeech: error: %s\n' % (exc,))
        return 1
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Run as a script: hand main()'s return value to the shell as exit status.
    raise SystemExit(main(sys.argv))
|
8
readme.md
Normal file
8
readme.md
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
FreeSpeech
|
||||||
|
----------
|
||||||
|
|
||||||
|
The idea here is to encode arbitrary data as human words taken from an arbitrary wordlist of arbitrary length.
|
||||||
|
Then, using the same wordlist, be able to decode the words back to the same data as before.
|
||||||
|
|
||||||
|
I will expand on the concept and techniques used later; for now, a working reference implementation can be found in
|
||||||
|
freespeech.py. A sample wordlist can be found in english.txt.
|
5
test.sh
Executable file
5
test.sh
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
#!/bin/bash
# Round-trip smoke test for freespeech.py.
#
# usage: ./test.sh COUNT BLOCK_SIZE
#
# Reads COUNT blocks of BLOCK_SIZE bytes from /dev/urandom and prints two MD5
# sums on stderr: one of the raw data, one of the same data after encoding
# and decoding with english.txt. The round trip is lossless when they match.
if [ "$#" -lt 2 ]; then
    echo "usage: $0 COUNT BLOCK_SIZE" >&2
    exit 1
fi
count=$1
bs=$2
# NOTE(review): the trailing `| cat` presumably keeps the script from
# returning before both process substitutions have finished -- confirm.
dd if=/dev/urandom count="$count" bs="$bs" 2>/dev/null | tee >(md5sum 1>&2) >(./freespeech.py english.txt | ./freespeech.py -d english.txt | md5sum 1>&2) 1>/dev/null 2>&1 | cat
# Same pipeline with base64 as the codec, kept as a baseline for comparison:
#dd if=/dev/urandom count="$count" bs="$bs" 2>/dev/null | tee >(md5sum 1>&2) >(base64 | base64 -di | md5sum 1>&2) 1>/dev/null 2>&1 | cat
|
Loading…
Reference in New Issue
Block a user