LWZ压缩编码原理与C++实现

大家好，欢迎来到IT知识分享网。

LWZ编码原理

LZW算法又叫“串表压缩算法，”就是通过建立一个字符串表，用较短的代码来表示较长的字符串来实现压缩，提取原始文本文件数据中的不同字符，基于这些字符创建一个编译表，然后用编译表中的字符的索引来替代原始文本文件数据中的相应字符，减少原始数据大小。

编码流程

解码流程

LWZ编码并不传输词典，而是在解码的过程中根据已有的编码规则动态建立词典，达到传输中编解码同步的效果。

C++代码：

bitio.h

#pragma once #ifndef __BITIO__ #define __BITIO__ #include <stdio.h> typedef struct { 
    FILE *fp; unsigned char mask; int rack; }BITFILE; BITFILE *OpenBitFileInput(char *filename); BITFILE *OpenBitFileOutput(char *filename); void CloseBitFileInput(BITFILE *bf); void CloseBitFileOutput(BITFILE *bf); int BitInput(BITFILE *bf); unsigned long BitsInput(BITFILE *bf, int count); void BitOutput(BITFILE *bf, int bit); void BitsOutput(BITFILE *bf, unsigned long code, int count); #endif // __BITIO__

bitio.cpp

#include <stdlib.h> #include <stdio.h> #include "bitio.h" #pragma warning(disable:4996)； BITFILE *OpenBitFileInput(char *filename) { 
    BITFILE *bf; bf = (BITFILE *)malloc(sizeof(BITFILE)); if (NULL == bf) return NULL; if (NULL == filename) bf->fp = stdin; else bf->fp = fopen(filename, "rb"); if (NULL == bf->fp) return NULL; bf->mask = 0x80; bf->rack = 0; return bf; } BITFILE *OpenBitFileOutput(char *filename) { 
    BITFILE *bf; bf = (BITFILE *)malloc(sizeof(BITFILE)); if (NULL == bf) return NULL; if (NULL == filename) bf->fp = stdout; else bf->fp = fopen(filename, "wb"); if (NULL == bf->fp) return NULL; bf->mask = 0x80; bf->rack = 0; return bf; } void CloseBitFileInput(BITFILE *bf) { 
    fclose(bf->fp); free(bf); } void CloseBitFileOutput(BITFILE *bf) { 
    // Output the remaining bits if (0x80 != bf->mask) fputc(bf->rack, bf->fp); fclose(bf->fp); free(bf); } int BitInput(BITFILE *bf) { 
    int value; if (0x80 == bf->mask) { 
    bf->rack = fgetc(bf->fp); if (EOF == bf->rack) { 
    fprintf(stderr, "Read after the end of file reached\n"); exit(-1); } } value = bf->mask & bf->rack; bf->mask >>= 1; if (0 == bf->mask) bf->mask = 0x80; return((0 == value) ? 0 : 1); } unsigned long BitsInput(BITFILE *bf, int count) { 
    unsigned long mask; unsigned long value; mask = 1L << (count - 1); value = 0L; while (0 != mask) { 
    if (1 == BitInput(bf)) value |= mask; mask >>= 1; } return value; } void BitOutput(BITFILE *bf, int bit) { 
    if (0 != bit) bf->rack |= bf->mask; bf->mask >>= 1; if (0 == bf->mask) { 
    // eight bits in rack fputc(bf->rack, bf->fp); bf->rack = 0; bf->mask = 0x80; } } void BitsOutput(BITFILE *bf, unsigned long code, int count) { 
    unsigned long mask; mask = 1L << (count - 1); while (0 != mask) { 
    BitOutput(bf, (int)(0 == (code&mask) ? 0 : 1)); mask >>= 1; } } #if 0 int main(int argc, char **argv) { 
    BITFILE *bfi, *bfo; int bit; int count = 0; if (1 < argc) { 
    if (NULL == OpenBitFileInput(bfi, argv[1])) { 
    fprintf(stderr, "fail open the file\n"); return -1; } } else { 
    if (NULL == OpenBitFileInput(bfi, NULL)) { 
    fprintf(stderr, "fail open stdin\n"); return -2; } } if (2 < argc) { 
    if (NULL == OpenBitFileOutput(bfo, argv[2])) { 
    fprintf(stderr, "fail open file for output\n"); return -3; } } else { 
    if (NULL == OpenBitFileOutput(bfo, NULL)) { 
    fprintf(stderr, "fail open stdout\n"); return -4; } } while (1) { 
    bit = BitInput(bfi); fprintf(stderr, "%d", bit); count++; if (0 == (count & 7))fprintf(stderr, " "); BitOutput(bfo, bit); } return 0; } #endif

LWZ_E.cpp

#include <stdlib.h> #include <stdio.h> #include "bitio.h" #include <iostream> #define MAX_CODE 65535 #pragma warning(disable:4996)； struct { 
    int suffix; int parent, firstchild, nextsibling; } dictionary[MAX_CODE + 1]; int next_code; int d_stack[MAX_CODE]; // stack for decoding a phrase #define input(f) ((int)BitsInput( f, 16)) #define output(f, x) BitsOutput( f, (unsigned long)(x), 16) int DecodeString(int start, int code); void InitDictionary(void); void PrintDictionary(void) { 
    int n; int count; for (n = 256; n < next_code; n++) { 
    count = DecodeString(0, n); printf("%4d->", n); while (0 < count--) printf("%c", (char)(d_stack[count])); printf("\n"); } } int DecodeString(int start, int code) { 
    int count; count = start; while (0 <= code) { 
    d_stack[count] = dictionary[code].suffix; code = dictionary[code].parent; count++; } return count; } void InitDictionary(void) { 
    int i; for (i = 0; i < 256; i++) { 
    dictionary[i].suffix = i; dictionary[i].parent = -1; dictionary[i].firstchild = -1; dictionary[i].nextsibling = i + 1; } dictionary[255].nextsibling = -1; next_code = 256; } /* * Input: string represented by string_code in dictionary, * Output: the index of character+string in the dictionary * index = -1 if not found */ int InDictionary(int character, int string_code) { 
    int sibling; if (0 > string_code) return character; sibling = dictionary[string_code].firstchild; while (-1 < sibling) { 
    if (character == dictionary[sibling].suffix) return sibling; sibling = dictionary[sibling].nextsibling; } return -1; } void AddToDictionary(int character, int string_code) { 
    int firstsibling, nextsibling; if (0 > string_code) return; dictionary[next_code].suffix = character; dictionary[next_code].parent = string_code; dictionary[next_code].nextsibling = -1; dictionary[next_code].firstchild = -1; firstsibling = dictionary[string_code].firstchild; if (-1 < firstsibling) { 
    // the parent has child nextsibling = firstsibling; while (-1 < dictionary[nextsibling].nextsibling) nextsibling = dictionary[nextsibling].nextsibling; dictionary[nextsibling].nextsibling = next_code; } else { 
   // no child before, modify it to be the first dictionary[string_code].firstchild = next_code; } next_code++; } void LZWEncode(FILE *fp, BITFILE *bf) { 
    int character; int string_code; int index; unsigned long file_length; fseek(fp, 0, SEEK_END); file_length = ftell(fp); fseek(fp, 0, SEEK_SET); BitsOutput(bf, file_length, 4 * 8); InitDictionary(); string_code = -1; while (EOF != (character = fgetc(fp))) { 
    index = InDictionary(character, string_code); if (0 <= index) { 
    // string+character in dictionary string_code = index; } else { 
    // string+character not in dictionary output(bf, string_code); if (MAX_CODE > next_code) { 
    // free space in dictionary // add string+character to dictionary AddToDictionary(character, string_code); } string_code = character; } } output(bf, string_code); } void LZWDecode(BITFILE *bf, FILE *fp) { 
    int character; int new_code, last_code; int phrase_length; unsigned long file_length; file_length = BitsInput(bf, 4 * 8); if (-1 == file_length) file_length = 0; InitDictionary(); last_code = -1;//初始化PW character=-1; while (file_length > 0) { 
    new_code = input(bf);//CW串第一个码字 if (next_code > new_code) { 
    phrase_length = DecodeString(0, new_code); // CW在词典中，输出CW所对应的字符 } else { 
    d_stack[0] = character; // CW不在词典中，输出PW+PW首字符 phrase_length = DecodeString(1, last_code); character = d_stack[phrase_length - 1]; //  while (phrase_length > 0) { 
    phrase_length--; fputc(d_stack[phrase_length], fp); // 输出解码后的字符流 file_length--; } if (MAX_CODE > next_code)// free space in dictionary { 
    // add string+character to dictionary AddToDictionary(character, last_code); } last_code = new_code; // PW = CW，返回循环进行下一个码字解码 } } int main(int argc, char **argv) { 
    FILE *fp; BITFILE *bf; if (4 > argc) { 
    fprintf(stdout, "usage: \n%s <o> <ifile> <ofile>\n", argv[0]); fprintf(stdout, "\t<o>: E or D reffers encode or decode\n"); fprintf(stdout, "\t<ifile>: input file name\n"); fprintf(stdout, "\t<ofile>: output file name\n"); return -1; } if ('E' == argv[1][0]) { 
    // do encoding fp = fopen(argv[2], "rb"); bf = OpenBitFileOutput(argv[3]); if (NULL != fp && NULL != bf) { 
    LZWEncode(fp, bf); fclose(fp); CloseBitFileOutput(bf); fprintf(stdout, "encoding done\n"); } } else if ('D' == argv[1][0]) { 
    // do decoding bf = OpenBitFileInput(argv[2]); fp = fopen(argv[3], "wb"); if (NULL != fp && NULL != bf) { 
    LZWDecode(bf, fp); fclose(fp); CloseBitFileInput(bf); fprintf(stdout, "decoding done\n"); } } else { 
    // otherwise fprintf(stderr, "not supported operation\n"); } return 0; }

最后进行LWZ压缩编解码测试：创建如下的文本文件，输入一串字符串

进行压缩编码，得到一个二进制文件（.dat)，再将这个二进制文件解码为txt文件

可见编解码成功。

压缩效率分析

文件类型	编码前文件大小	编码后文件大小	压缩比
.vwf	57KB	15KB	380％
.pdf	479KB	559KB	85.6%
.srt	400KB	124KB	322%
.docx	95KB	117KB	81.1%
.jpg	28KB	45KB	62.2%
.exe	6158KB	6259KB	98.3%
.pptx	266KB	334KB	67.6%
.txt	242KB	57KB	425%
.zip	974KB	1214KB	80.2%
.wav	862KB	1024KB	84.2%

从压缩效率分析来看，.txt 文件与.srt文件这样的文本类型文件得到了较大的压缩比，而其他文件，比如多媒体文件，则出现了负压缩，结果表明了LWZ压缩编码适合于文本文件这样重复内容较多的文件，而其他文件也许并不存在大量的重复数据，压缩效果并不好，在构建词典的过程中形成的词典比较庞大，导致了数据的增加。

免责声明：本站所有文章内容,图片，视频等均是来源于用户投稿和互联网及文摘转载整编而成，不代表本站观点，不承担相关法律责任。其著作权各归其原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容,侵犯到您的权益，请在线联系站长,一经查实,本站将立刻删除。本文来自网络,若有侵权，请联系删除，如若转载，请注明出处：https://haidsoft.com/138099.html