`
oldrev
  • 浏览: 230218 次
  • 性别: Icon_minigender_1
  • 来自: 昆明
社区版块
存档分类
最新评论

Range Coding 的 D 实现

阅读更多

Range Coding 是算术编码的变种,二者的效率几乎没有差别,Range Coding 速度更快,且没有专利问题。下面的程序移植和改进自一个非常清晰简洁的C++实现。当然,直接使用下面的代码去压缩文件效果并不好,速度慢压缩率也低,Range Coding 更适合作为其他算法的后端,比如 LZ77、Block Sorting。

如果你看到这里一头雾水的话,可以上 wikipedia 参考“算术编码”,不过更好的选择是找一篇名为《笨笨数据压缩教程》的系列文章来入门。

D1.0 Code
  1. /** Code for range coding, derived from public domain work by Dmitry Subbotin
  2. Modified to use 64-bit integer maths, for increased precision
  3. Authors: Dmitry Subbotin (initial author of the Carry-less Range Coder)
  4. Sachin Garg (C++)
  5. Wei Li (D language)
  6. Reference: http://sachingarg.com/compression/entropy_coding/64bit/
  7. */
  8. import std.stdio;
  9. template RangeCoding64Base()
  10. {
  11. const ulong Top = 1UL << 56UL;
  12. const ulong Bottom = 1UL << 48UL;
  13. const ulong MaxRange = Bottom;
  14. ulong m_low = 0;
  15. ulong m_range = ulong.max;
  16. }
  17. struct RangeEncoding64
  18. {
  19. mixin RangeCoding64Base;
  20. private bool m_flushed = false ;
  21. private void delegate(ubyte) m_sink = null;
  22. void init( void delegate(ubyte) sink)
  23. {
  24. assert(sink !is null);
  25. m_sink = sink;
  26. }
  27. void close()
  28. {
  29. if (!m_flushed)
  30. flush();
  31. }
  32. void encode(ulong symbolLow, ulong symbolHigh, ulong totalRange)
  33. {
  34. m_range /= totalRange;
  35. m_low += symbolLow * m_range;
  36. m_range *= symbolHigh - symbolLow;
  37. while ((m_low ^ (m_low + m_range)) < Top || m_range < Bottom && ((m_range = -m_low & (Bottom - 1)), true ))
  38. {
  39. ubyte b = m_low >> (m_low.sizeof * 8 - 8);
  40. m_sink(b);
  41. m_range <<= 8;
  42. m_low <<= 8;
  43. }
  44. }
  45. void flush()
  46. {
  47. if (!m_flushed)
  48. {
  49. for ( int i = 0; i < m_low. sizeof ; i++)
  50. {
  51. ubyte b = m_low >> (m_low.sizeof * 8 - 8);
  52. m_sink(b);
  53. m_low <<= 8;
  54. }
  55. m_flushed = true ;
  56. }
  57. }
  58. }
  59. struct RangeDecoding64
  60. {
  61. mixin RangeCoding64Base;
  62. private ulong m_code;
  63. private ubyte delegate() m_emitter;
  64. void init(ubyte delegate() emitter)
  65. {
  66. assert(emitter !is null);
  67. m_emitter = emitter;
  68. for ( size_t i = 0; i < m_code. sizeof ; i++)
  69. {
  70. m_code = (m_code << 8) | emitter();
  71. }
  72. }
  73. ulong currentCount(ulong totalRange) //积累频率
  74. {
  75. return (m_code - m_low) / (m_range /= totalRange);
  76. }
  77. void decode(ulong symbolLow, ulong symbolHigh, ulong totalRange)
  78. {
  79. m_low += symbolLow * m_range;
  80. m_range *= symbolHigh - symbolLow;
  81. while ((m_low ^ m_low + m_range) < Top || m_range < Bottom && ((m_range = -m_low & Bottom - 1), true ))
  82. {
  83. m_code= m_code << 8 | m_emitter(), m_range <<= 8, m_low <<= 8;
  84. }
  85. }
  86. }
  87. //简单的0阶自适应模型
  88. struct OrderZeroModel(uint SymMax = 255)
  89. {
  90. public const uint SymbolMax = SymMax;
  91. public const uint NoOfSymbols = SymbolMax + 2;
  92. //最后一个元素是积累频率
  93. private ulong[SymbolMax + 2] m_freq;
  94. void init()
  95. {
  96. for ( size_t i = 0; i < m_freq.length; i++)
  97. {
  98. m_freq[i] = i;
  99. }
  100. }
  101. private void rescale()
  102. { //用了64位以后似乎这是不可能的
  103. ulong newTotal = 0;
  104. for ( size_t i = 1; i < m_freq.length - 1; i++)
  105. {
  106. newTotal += ((m_freq[i] - m_freq[i - 1]) / 2) + 1;
  107. m_freq[i] = m_freq[i - 1] + ((m_freq[i] - m_freq[i - 1]) / 2) + 1;
  108. }
  109. m_freq[m_freq.length - 1] = newTotal;
  110. }
  111. void update(uint sym)
  112. {
  113. for ( size_t i = sym + 1; i < m_freq.length; i++) {
  114. m_freq[i]++;
  115. }
  116. if (total() >= RangeCoding64Base!().MaxRange)
  117. rescale();
  118. }
  119. uint getSymbol(ulong n)
  120. {
  121. uint sym = SymbolMax;
  122. while (m_freq[sym] > n) --sym;
  123. return sym;
  124. }
  125. uint total()
  126. {
  127. return m_freq[m_freq.length - 1];
  128. }
  129. ulong low(uint sym)
  130. {
  131. return m_freq[sym];
  132. }
  133. ulong high(uint sym)
  134. {
  135. return m_freq[sym + 1];
  136. }
  137. ulong opIndex(uint rhs)
  138. {
  139. return m_freq[rhs];
  140. }
  141. }
  142. // sample:
  143. int main(string[] args)
  144. {
  145. const uint EndOfStream = 256;
  146. ubyte[] compressed;
  147. void sink(ubyte u)
  148. {
  149. compressed ~= u;
  150. }
  151. ubyte[] origin;
  152. for ( int i = 0; i < 10000; i++)
  153. origin ~= ['A', 'B', 'C', 'D', 'E', 'F', 'G'];
  154. RangeEncoding64 encoder;
  155. encoder.init(&sink);
  156. OrderZeroModel!(EndOfStream) model; //我们把 256 作为 Range Coding 流结束符
  157. model.init();
  158. writefln("compression started..." );
  159. foreach(ubyte b; origin)
  160. {
  161. encoder.encode(model.low(b), model.high(b), model.total);
  162. model.update(b);
  163. }
  164. encoder.encode(model.low(EndOfStream), model.high(EndOfStream), model.total);
  165. model.update(EndOfStream);
  166. encoder.close();
  167. writefln("originial size: %d" , origin.length);
  168. writefln("compressed size: %d (%d%%)" , compressed.length,
  169. (origin.length - compressed.length) * 100 / origin.length);
  170. writefln("decoding...." );
  171. size_t pos = 0;
  172. ubyte delegate() emitter = {
  173. return compressed[pos++];
  174. };
  175. model.init();
  176. RangeDecoding64 dec;
  177. dec.init(emitter);
  178. ubyte[] decompressed;
  179. while ( true )
  180. {
  181. ulong count = dec.currentCount(model.total);
  182. uint sym = model.getSymbol(count);
  183. if (sym == 256) break ;
  184. decompressed ~= sym;
  185. dec.decode(model.low(sym), model.high(sym), model.total);
  186. model.update(sym);
  187. }
  188. writefln(decompressed.length);
  189. //确保解码无误
  190. assert(decompressed[] == origin[]);
  191. return 0;
  192. }


分享到:
评论
8 楼 oldrev 2008-04-17  
引用

oldrev 2008-01-12
LZMA SDK 只是一个 LZMA 算法的实现,并不包括压缩包的处理,也不支持其他算法


此说法要修正了,新版 LZMA SDK 包含了 7z 压缩包处理
7 楼 oldrev 2008-01-12  
LZMA SDK 只是一个 LZMA 算法的实现,并不包括压缩包的处理,也不支持其他算法
6 楼 liuwangxia 2008-01-11  
另外 LZMA SDK 提供4种许可协议:

引用
LZMA SDK is available under any of the following licenses:

   1. GNU Lesser General Public License (GNU LGPL)
   2. Common Public License (CPL)
   3. Simplified license for unmodified code (read SPECIAL EXCEPTION)
   4. Proprietary license
5 楼 liuwangxia 2008-01-11  
LZMA SDK 目前支持 C, C++, C#, Java, 在 Linux 下有支持。

sudo apt-get install p7zip-full


http://7-zip.org/sdk.html
4 楼 oldrev 2007-08-21  
lzma sdk是lgpl,跟 tango 的 bsd 不兼容。

lzma 的实现就是 lz77 + range coding

7-zip 确实很出色,所有的算法和压缩包格式支持都是自己实现的,比什么 ark, file-roller 通过调用命令行程序之流牛得多。缺点是它的那个小类库完全是为 windows 设计的,造成整个软件没可移植性
3 楼 shawind 2007-08-21  
好像lzma是gpl吧?
2 楼 oldrev 2007-08-20  
引用
嗯,不错,建议提交到tango,tango目前在搞VFS,封装好一个文件级的压缩,就很接近了


这个直接压缩文件很慢,而且压缩率不高,tango如果需要实用的压缩的话可以考虑移植 7-zip 的lzma算法
1 楼 DavidL 2007-08-20  
嗯,不错,建议提交到tango,tango目前在搞VFS,封装好一个文件级的压缩,就很接近了

相关推荐

Global site tag (gtag.js) - Google Analytics