#!/usr/bin/env python
#-*- coding: utf8 -*-
from math import floor, log, ceil
class Arithmetic:
def __init__(self, freqs_tuples, MAX):
# верхняя граница интервала
self.max = MAX
# подготовка словаря с интервалами вероятностей символов
self.__freqs2dict( freqs_tuples )
def encode(self, s):
u"""
Кодирование строки s на основе вероятностей в self.freqs
Возвращает число из итогового диапазона
"""
# начинает с полного интервала [0,1)
(f, t) = (0, self.max)
for c in s:
# координаты нового интервала
(cf, ct) = self.freqs[c]
# длина базового интервала
t_f = t-f
print("base: %d\n" % t_f)
# Получение нового интервала.
# Важно: т.к. в вычислении фигурирует f, то его
# изменение выполняется только после вычисления t.
# Т.е. не надо менять строки местами : )
t = int( floor( ct*t_f / self.max + f ) )
f = int( floor( cf*t_f / self.max + f ) )
print("f: %d, t: %d, cf: %d, ct: %d, self.max: %d\n" % (f,t,cf,ct,self.max))
return ( f, t )
def decode(self, i):
u"""
Декодирование числа в строку на основе вероятностей в self.freqs
"""
res = []
c = None
while c != '#':
c = self.__find_interval( i )
res.append( c )
(f,t) = self.freqs[c]
i = int( ( i-f ) * self.max / ( t-f ) )
return ''.join(res)
def __freqs2dict(self, freqs_tuples):
u"""
Преобразование кортежа вероятностей символов строки в
словарь интервалов с учетом self.max
"""
self.freqs = {}
left = 0
for (k,v) in freqs:
self.freqs[k] = ( int(left*self.max), int((left+v)*self.max) )
left += v
def __optimize(self, f, t):
u"""
Подбор числа из диапазона [f, t) такого, чтобы в нем
было максимально возможное на интервале число нулевых младших бит
"""
#return int( (t-f)/2 + f )
f = int(f)
t = int(t)
x = f ^ t
if not x: return f
xl = int( ceil( log(x, 2) ) )
if not xl: return f
mask_and = ~((1<<xl)-1)
mask_or = 1<<(xl-1)
r = (f & mask_and) | mask_or
return r
def __find_interval(self, v):
u"""
Ищем интервал, которому принадлежит значение v
"""
for ( c, (cf, ct) ) in self.freqs.items():
if v>=cf and v<ct:
return c
freqs = ( ('a',0.4), ('b',0.2), ('h',0.1), ('r',0.25), ('#',0.05) )
src = 'abraha#'
MAX = 100000000
coder = Arithmetic( freqs, MAX )
print 'SOURCE: ', src
enc = coder.encode( src )
print 'ENCODED: ', enc
#dec = coder.decode( enc )
#print 'DECODED: ', dec
IyEvdXNyL2Jpbi9lbnYgcHl0aG9uCgojLSotIGNvZGluZzogdXRmOCAtKi0KCgpmcm9tIG1hdGggaW1wb3J0IGZsb29yLCBsb2csIGNlaWwKCgpjbGFzcyBBcml0aG1ldGljOgoKCiAgICBkZWYgX19pbml0X18oc2VsZiwgZnJlcXNfdHVwbGVzLCBNQVgpOgoKICAgICAgICAjINCy0LXRgNGF0L3Rj9GPINCz0YDQsNC90LjRhtCwINC40L3RgtC10YDQstCw0LvQsAoKICAgICAgICBzZWxmLm1heCA9IE1BWAoKICAgICAgICAjINC/0L7QtNCz0L7RgtC+0LLQutCwINGB0LvQvtCy0LDRgNGPINGBINC40L3RgtC10YDQstCw0LvQsNC80Lgg0LLQtdGA0L7Rj9GC0L3QvtGB0YLQtdC5INGB0LjQvNCy0L7Qu9C+0LIKCiAgICAgICAgc2VsZi5fX2ZyZXFzMmRpY3QoIGZyZXFzX3R1cGxlcyApCgoKICAgIGRlZiBlbmNvZGUoc2VsZiwgcyk6CgogICAgICAgIHUiIiIKCiAgICAgICAgICAgINCa0L7QtNC40YDQvtCy0LDQvdC40LUg0YHRgtGA0L7QutC4IHMg0L3QsCDQvtGB0L3QvtCy0LUg0LLQtdGA0L7Rj9GC0L3QvtGB0YLQtdC5INCyIHNlbGYuZnJlcXMKCiAgICAgICAgICAgINCS0L7Qt9Cy0YDQsNGJ0LDQtdGCINGH0LjRgdC70L4g0LjQtyDQuNGC0L7Qs9C+0LLQvtCz0L4g0LTQuNCw0L/QsNC30L7QvdCwCgogICAgICAgICIiIgoKICAgICAgICAjINC90LDRh9C40L3QsNC10YIg0YEg0L/QvtC70L3QvtCz0L4g0LjQvdGC0LXRgNCy0LDQu9CwIFswLDEpCgogICAgICAgIChmLCB0KSA9ICgwLCBzZWxmLm1heCkKCiAgICAgICAgZm9yIGMgaW4gczoKCiAgICAgICAgICAgICMg0LrQvtC+0YDQtNC40L3QsNGC0Ysg0L3QvtCy0L7Qs9C+INC40L3RgtC10YDQstCw0LvQsAoKICAgICAgICAgICAgKGNmLCBjdCkgPSBzZWxmLmZyZXFzW2NdCgogICAgICAgICAgICAjINC00LvQuNC90LAg0LHQsNC30L7QstC+0LPQviDQuNC90YLQtdGA0LLQsNC70LAKCiAgICAgICAgICAgIHRfZiA9IHQtZgogICAgICAgICAgICBwcmludCgiYmFzZTogJWRcbiIgJSB0X2YpCgogICAgICAgICAgICAjINCf0L7Qu9GD0YfQtdC90LjQtSDQvdC+0LLQvtCz0L4g0LjQvdGC0LXRgNCy0LDQu9CwLgoKICAgICAgICAgICAgIyDQktCw0LbQvdC+OiDRgi7Qui4g0LIg0LLRi9GH0LjRgdC70LXQvdC40Lgg0YTQuNCz0YPRgNC40YDRg9C10YIgZiwg0YLQviDQtdCz0L4KCiAgICAgICAgICAgICMg0LjQt9C80LXQvdC10L3QuNC1INCy0YvQv9C+0LvQvdGP0LXRgtGB0Y8g0YLQvtC70YzQutC+INC/0L7RgdC70LUg0LLRi9GH0LjRgdC70LXQvdC40Y8gdC4KCiAgICAgICAgICAgICMg0KIu0LUuINC90LUg0L3QsNC00L4g0LzQtdC90Y/RgtGMINGB0YLRgNC+0LrQuCDQvNC10YHRgtCw0LzQuCA6ICkKCiAgICAgICAgICAgIHQgPSBpbnQoIGZsb29yKCBjdCp0X2YgLyBzZWxmLm1heCArIGYgKSApCgogICAgICAgICAgICBmID0gaW50KCBmbG9vciggY2YqdF9mIC8gc2VsZi5tYXggKyBmICkgKQogICAgICAgICAgICBwcmludCgiZjogJWQsIHQ6ICVkLCBjZjogJWQsIGN0OiAlZCwgc2VsZi5tYXg6ICVkXG4iICUgKGYsdCxjZixjdCxzZWxmLm1heCkpCgogICAgICAgIHJldHVybiAoIGYsIHQgKQoKCiAgICBkZWYgZGVjb2RlKHNlbGYsIGkpOgoKICAgICAgICB1IiIiCgogICAgICAgICAgICDQlNC10LrQvtC00LjRgNC+0LLQsNC90LjQtSDRh9C40YHQu9CwINCyINGB0YLRgNC+0LrRgyDQvdCwINC+0YHQvdC+0LLQtSDQstC10YDQvtGP0YLQvdC+0YHRgtC10Lkg0LIgc2VsZi5mcmVxcwoKICAgICAgICAiIiIKCiAgICAgICAgcmVzID0gW10KCiAgICAgICAgYyA9IE5vbmUKCiAgICAgICAgd2hpbGUgYyAhPSAnIyc6CgogICAgICAgICAgICBjID0gc2VsZi5fX2ZpbmRfaW50ZXJ2YWwoIGkgKQoKICAgICAgICAgICAgcmVzLmFwcGVuZCggYyApCgogICAgICAgICAgICAoZix0KSA9IHNlbGYuZnJlcXNbY10KCiAgICAgICAgICAgIGkgPSBpbnQoICggaS1mICkgKiBzZWxmLm1heCAvICggdC1mICkgKQoKICAgICAgICByZXR1cm4gJycuam9pbihyZXMpCgoKICAgIGRlZiBfX2ZyZXFzMmRpY3Qoc2VsZiwgZnJlcXNfdHVwbGVzKToKCiAgICAgICAgdSIiIgoKICAgICAgICAgICAg0J/RgNC10L7QsdGA0LDQt9C+0LLQsNC90LjQtSDQutC+0YDRgtC10LbQsCDQstC10YDQvtGP0YLQvdC+0YHRgtC10Lkg0YHQuNC80LLQvtC70L7QsiDRgdGC0YDQvtC60Lgg0LIKCiAgICAgICAgICAgICAg0YHQu9C+0LLQsNGA0Ywg0LjQvdGC0LXRgNCy0LDQu9C+0LIg0YEg0YPRh9C10YLQvtC8IHNlbGYubWF4CgogICAgICAgICIiIgoKICAgICAgICBzZWxmLmZyZXFzID0ge30KCiAgICAgICAgbGVmdCA9IDAKCiAgICAgICAgZm9yIChrLHYpIGluIGZyZXFzOgoKICAgICAgICAgICAgc2VsZi5mcmVxc1trXSA9ICggaW50KGxlZnQqc2VsZi5tYXgpLCBpbnQoKGxlZnQrdikqc2VsZi5tYXgpICkKCiAgICAgICAgICAgIGxlZnQgKz0gdgoKCiAgICBkZWYgX19vcHRpbWl6ZShzZWxmLCBmLCB0KToKCiAgICAgICAgdSIiIgoKICAgICAgICAgICAg0J/QvtC00LHQvtGAINGH0LjRgdC70LAg0LjQtyDQtNC40LDQv9Cw0LfQvtC90LAgW2YsIHQpINGC0LDQutC+0LPQviwg0YfRgtC+0LHRiyDQsiDQvdC10LwKCiAgICAgICAgICAgINCx0YvQu9C+INC80LDQutGB0LjQvNCw0LvRjNC90L4g0LLQvtC30LzQvtC20L3QvtC1INC90LAg0LjQvdGC0LXRgNCy0LDQu9C1INGH0LjRgdC70L4g0L3Rg9C70LXQstGL0YUg0LzQu9Cw0LTRiNC40YUg0LHQuNGCCgogICAgICAgICIiIgoKICAgICAgICAjcmV0dXJuIGludCggKHQtZikvMiArIGYgKQoKICAgICAgICBmID0gaW50KGYpCgogICAgICAgIHQgPSBpbnQodCkKCiAgICAgICAgeCA9IGYgXiB0CgogICAgICAgIGlmIG5vdCB4OiByZXR1cm4gZgoKCiAgICAgICAgeGwgPSBpbnQoIGNlaWwoIGxvZyh4LCAyKSApICkKCiAgICAgICAgaWYgbm90IHhsOiByZXR1cm4gZgoKCiAgICAgICAgbWFza19hbmQgPSB+KCgxPDx4bCktMSkKCiAgICAgICAgbWFza19vciAgPSAxPDwoeGwtMSkKCgogICAgICAgIHIgPSAoZiAmIG1hc2tfYW5kKSB8IG1hc2tfb3IKCgogICAgICAgIHJldHVybiByCgoKICAgIGRlZiBfX2ZpbmRfaW50ZXJ2YWwoc2VsZiwgdik6CgogICAgICAgIHUiIiIKCiAgICAgICAgICAgINCY0YnQtdC8INC40L3RgtC10YDQstCw0LssINC60L7RgtC+0YDQvtC80YMg0L/RgNC40L3QsNC00LvQtdC20LjRgiDQt9C90LDRh9C10L3QuNC1IHYKCiAgICAgICAgIiIiCgogICAgICAgIGZvciAoIGMsIChjZiwgY3QpICkgaW4gc2VsZi5mcmVxcy5pdGVtcygpOgoKICAgICAgICAgICAgaWYgdj49Y2YgYW5kIHY8Y3Q6CgogICAgICAgICAgICAgICAgcmV0dXJuIGMKCgpmcmVxcyA9ICggKCdhJywwLjQpLCAoJ2InLDAuMiksICgnaCcsMC4xKSwgKCdyJywwLjI1KSwgKCcjJywwLjA1KSApCgpzcmMgPSAnYWJyYWhhIycKCk1BWCA9IDEwMDAwMDAwMAoKCmNvZGVyID0gQXJpdGhtZXRpYyggZnJlcXMsIE1BWCApCgoKcHJpbnQgJ1NPVVJDRTogICcsIHNyYwoKCmVuYyA9IGNvZGVyLmVuY29kZSggc3JjICkKCnByaW50ICdFTkNPREVEOiAnLCBlbmMKCgojZGVjID0gY29kZXIuZGVjb2RlKCBlbmMgKQoKI3ByaW50ICdERUNPREVEOiAnLCBkZWMKCg==
SOURCE: abraha#
base: 100000000
f: 0, t: 40000000, cf: 0, ct: 40000000, self.max: 100000000
base: 40000000
f: 16000000, t: 24000000, cf: 40000000, ct: 60000000, self.max: 100000000
base: 8000000
f: 21600000, t: 23600000, cf: 70000000, ct: 95000000, self.max: 100000000
base: 2000000
f: 21600000, t: 22400000, cf: 0, ct: 40000000, self.max: 100000000
base: 800000
f: 22080000, t: 22160000, cf: 60000000, ct: 70000000, self.max: 100000000
base: 80000
f: 22080000, t: 22112000, cf: 0, ct: 40000000, self.max: 100000000
base: 32000
f: 22110400, t: 22112000, cf: 95000000, ct: 100000000, self.max: 100000000
ENCODED: (22110400, 22112000)