【数据压缩】–LZW算法

大家好，欢迎来到IT知识分享网。

LZW算法原理

LZW编码

编码算法的思想

数据结构分析


尾缀字符	suffix
母节点	parent
第一个孩子节点	firstchild
下一个兄弟节点	nextsibling

struct { 
    int suffix; int parent, firstchild, nextsibling; } dictionary[MAX_CODE+1]; int next_code; int d_stack[MAX_CODE]; // stack for decoding a phrase

主要功能模块

初始化词典

void InitDictionary( void){ 
   //初始化词典，将0-255根节点初始化 int i; for( i=0; i<256; i++){ 
    dictionary[i].suffix = i;//尾缀字符 dictionary[i].parent = -1;//母节点初始化为空 dictionary[i].firstchild = -1;//子节点初始化为空 dictionary[i].nextsibling = i+1;//下一个兄弟节点 } dictionary[255].nextsibling = -1;//最后一个节点无下一个兄弟 next_code = 256;//新词条编码为256 }

查找词典中是否有字符串

将当前字符依次与当前前缀的孩子节点进行比较，相同时说明词典中有该字符串，否则说明词典中没有该字符串，返回-1。

int InDictionary( int character, int string_code){ 
   //查找词典中是否有字符串 int sibling; if( 0>string_code) return character;//string_code==-1,当前词条无前缀，为单个字符，初始化后已经在词典中，返回此字符 sibling = dictionary[string_code].firstchild;//如果不是单个字符，找当前前缀的第一个孩子节点 while( -1<sibling){ 
    if( character == dictionary[sibling].suffix) return sibling;//如果此孩子节点的尾缀字符等于character，则当前词条在词典中，返回此孩子节点 sibling = dictionary[sibling].nextsibling;//否则，找当前前缀的下一个孩子节点 } return -1;//没有找到，返回-1 }

将新串加入词典

将新串加入词典，则需要添加新节点，设置新节点的suffix,parent,nextsibling,firstchild，另外还需建立新节点与当前前缀其他孩子节点的关系。

void AddToDictionary( int character, int string_code){ 
   //将新串加入词典 int firstsibling, nextsibling; if( 0>string_code) return;//string_code==-1,当前词条无前缀，为单个字符，初始化后已经在词典中 dictionary[next_code].suffix = character;//添加新节点dictionary[next_code]，其尾缀字符为character dictionary[next_code].parent = string_code;//新节点的母节点为string_code dictionary[next_code].nextsibling = -1;//新节点还没有下一个兄弟 dictionary[next_code].firstchild = -1;//新节点还没有孩子 firstsibling = dictionary[string_code].firstchild;//firstsibling为当前前缀的第一个孩子 if( -1<firstsibling){ 
    // 如果当前前缀有孩子 nextsibling = firstsibling; while( -1<dictionary[nextsibling].nextsibling ) //只要nextsibling还有下一个兄弟 nextsibling = dictionary[nextsibling].nextsibling;//让nextsibling替换为它的下一个兄弟，这样可以找到当前前缀的最后一个孩子 dictionary[nextsibling].nextsibling = next_code;//当前前缀的最后一个孩子的下一个兄弟为新节点 }else{ 
   // 当前前缀无孩子,则新节点为它的第一个孩子 dictionary[string_code].firstchild = next_code; } next_code ++;//为下一个新词条所用 }

编码

如果当前字符串在词典中，将当前字符串设为前缀。如果当前字符串不在词典中，将当前字符串写入词典中，输出当前字符串前缀对应的码字，当前字符设为前缀。详见代码注释。

void LZWEncode( FILE *fp, BITFILE *bf){ 
   //LZW编码 int character;//当前字符C int string_code;//当前前缀P int index;//定义索引 unsigned long file_length;//定义文件长度 fseek( fp, 0, SEEK_END);//将指针置于文件末尾 file_length = ftell( fp);//得到当前文件长度 fseek( fp, 0, SEEK_SET);//将指针移回文件开始 BitsOutput( bf, file_length, 4*8); InitDictionary();//初始化词典 string_code = -1;//初始化后的词典为单个字符，无前缀 while( EOF!=(character=fgetc( fp))){ 
   //EOF：END OF FILE，每次从文件中读取一个字符，直到文件的最后一个字符 index = InDictionary( character, string_code);//查找当前字符串是否在词典中，如果不在，则index=-1,如果在，则返回的是当前字符串的后缀（单个字符时为该字符），index>=0 if( 0<=index){ 
    // string+character in dictionary 如果当前字符串在词典中 string_code = index;//将当前字符串设为前缀 }else{ 
    // string+character not in dictionary 如果当前字符串不在词典中 output( bf, string_code);//输出当前前缀对应的码字 if( MAX_CODE > next_code){ 
    // free space in dictionary，只要未超过词典容限 // add string+character to dictionary AddToDictionary( character, string_code);//将当前字符串加入到词典中 } string_code = character;//将当前字符C设为前缀P } } output( bf, string_code);//最后一个前缀无下一个字符，将其对应的码字输出 }

LZW解码

解码算法的思想

解码

LZWDecode

首先初始化词典，如果当前码字不在词典中，将character赋给d_stack[0]，并在d_stack[]中存储先前码字对应的字符串（倒序存放），如果当前码字在词典中，在d_stack[]中存储当前码字对应字符串（倒序存放），则可以得到先前码字或当前码字的第一个字符d_stack[phrase_length-1]。当前码字不在词典中时，新词条为当前前缀+当前前缀的第一个字符，将先前码字对应的字符串+先前码字对应字符串的第一个字符输出到字符流中。当前码字在词典中时，将当前码字对应的字符串输出到字符流中，新词条为当前前缀+当前字符串的第一个字符。最后将当前码字设为先前码字。

void LZWDecode( BITFILE *bf, FILE *fp){ 
   //LZW解码 //需填充 int character;//定义当前字符 int new_code, last_code;//定义当前码字和先前码字 int phrase_length;//定义字符串长度 unsigned long file_length;//定义输出文件长度 file_length = BitsInput(bf, 4 * 8);//解码后文件的大小 if (-1 == file_length) { 
    file_length = 0; } InitDictionary();//初始化词典，使在开始译码时词典包含所有可能的前缀根 last_code = -1;//译码前无先前码字 while (file_length > 0)//file_length > 0表示解码未完成，继续解码 { 
    new_code = input(bf);//从输入文件中读出一个码字，为当前码字 if (new_code >= next_code)//如果当前码字不在词典中 { 
    d_stack[0] = character;//先将character赋给d_stack[0] phrase_length = DecodeString(1, last_code);//从d_stack[1]开始存储先前码字对应的字符串， //d_stack[1]存储的是先前码字对应的字符串的最后一个字符，phrase_length为先前码字对应字符串的长度+1 //d_stack[phrase_length-1]存储的是先前码字对应字符串的第一个字符 } else//如果当前码字在词典中 { 
    phrase_length = DecodeString(0, new_code);//从d_stack[0]开始存储当前码字对应的字符串， //d_stack[0]存储的是当前码字对应字符串的最后一个字符,phrase_length为当前码字对应字符串的长度 // d_stack[phrase_length-1]存储的是当前码字对应字符串的第一个字符 } character = d_stack[phrase_length-1]; //当前码字不在词典中时，character为先前码字对应字符串的第一个字符 //当前码字在词典中时，character为当前码字对应字符串的第一个字符 while (0 < phrase_length)//循环，直到d_stack[]全部输出 { 
    phrase_length--; fputc(d_stack[phrase_length],fp);//d_stack[]为倒序存放，这样可以正序输出 //当前码字不在词典中时，将先前码字对应的字符串+先前码字对应字符串的第一个字符输出到字符流中 //当前码字在词典中时，将当前码字对应的字符串输出到字符流中 file_length--; } if (MAX_CODE > next_code)//未超过词典容限时 { 
    AddToDictionary(character, last_code); //当前码字不在词典中时，新词条为当前前缀+当前前缀的第一个字符 //当前码字在词典中时，新词条为当前前缀+当前字符串的第一个字符 } last_code = new_code;//将当前码字设为先前码字 } }

DecodeString

int DecodeString( int start, int code){ 
    //需填充 int count; count = start; while (0 <= code)//从最后一个节点开始，向上搜索并存入stack[]中，直到找到最后一个母节点 { 
    d_stack[count] = dictionary[code].suffix; code = dictionary[code].parent; count++;//记录次数 } return count;//count=code对应字符串的长度+start }

调试程序

编码

调用PrintDictionary()函数来输出256之后的词典

void PrintDictionary( void){ 
    int n; int count; for( n=256; n<next_code; n++){ 
    count = DecodeString( 0, n); printf( "%4d->", n); while( 0<count--) printf("%c", (char)(d_stack[count])); printf( "\n"); } }

得到的结果和理论相同。编码正确。

解码

压缩效率分析

十种不同格式文件的压缩效率

更改词典总数上限后的编码效率

增大词典总数上限后，数据量较第一次压缩后的结果更小了，但除了6.avi数据量较原文件有所减小，其他文件压缩后的数据量还是大于原文件。对于词典编码总数仍达到上限的文件，可能是MAXCODE的值仍旧太小，数据量过大，导致压缩算法未充分利用。词典编码总数未达到上界，压缩后数据量大于原文件的，可能是文件中重复字符的概率小，LZW压缩算法不能发挥优势。

字符重复率很高的文本文档

LZW算法的优缺点

由此可见，字符串重复率低时，压缩效率受到很大的影响，这是LZW算法的一大局限。它的优点是只需一遍扫描，具有自适应的特点，算法简单，便于快速实现。

程序代码

bitio.h

/* * Declaration for bitwise IO * * vim: ts=4 sw=4 cindent */ #ifndef __BITIO__ #define __BITIO__ #include <stdio.h> typedef struct{ 
    FILE *fp; unsigned char mask; int rack; }BITFILE; BITFILE *OpenBitFileInput( char *filename); BITFILE *OpenBitFileOutput( char *filename); void CloseBitFileInput( BITFILE *bf); void CloseBitFileOutput( BITFILE *bf); int BitInput( BITFILE *bf); unsigned long BitsInput( BITFILE *bf, int count); void BitOutput( BITFILE *bf, int bit); void BitsOutput( BITFILE *bf, unsigned long code, int count); #endif // __BITIO__

bitio.c

/* * Definitions for bitwise IO * * vim: ts=4 sw=4 cindent */ #include <stdlib.h> #include <stdio.h> #include "bitio.h" BITFILE *OpenBitFileInput( char *filename){ 
    BITFILE *bf; bf = (BITFILE *)malloc( sizeof(BITFILE)); if( NULL == bf) return NULL; if( NULL == filename) bf->fp = stdin; else bf->fp = fopen( filename, "rb"); if( NULL == bf->fp) return NULL; bf->mask = 0x80; bf->rack = 0; return bf; } BITFILE *OpenBitFileOutput( char *filename){ 
    BITFILE *bf; bf = (BITFILE *)malloc( sizeof(BITFILE)); if( NULL == bf) return NULL; if( NULL == filename) bf->fp = stdout; else bf->fp = fopen( filename, "wb"); if( NULL == bf->fp) return NULL; bf->mask = 0x80; bf->rack = 0; return bf; } void CloseBitFileInput( BITFILE *bf){ 
    fclose( bf->fp); free( bf); } void CloseBitFileOutput( BITFILE *bf){ 
    // Output the remaining bits if( 0x80 != bf->mask) fputc( bf->rack, bf->fp); fclose( bf->fp); free( bf); } int BitInput( BITFILE *bf){ 
    int value; if( 0x80 == bf->mask){ 
    bf->rack = fgetc( bf->fp); if( EOF == bf->rack){ 
    fprintf(stderr, "Read after the end of file reached\n"); exit( -1); } } value = bf->mask & bf->rack; bf->mask >>= 1; if( 0==bf->mask) bf->mask = 0x80; return( (0==value)?0:1); } unsigned long BitsInput( BITFILE *bf, int count){ 
    unsigned long mask; unsigned long value; mask = 1L << (count-1); value = 0L; while( 0!=mask){ 
    if( 1 == BitInput( bf)) value |= mask; mask >>= 1; } return value; } void BitOutput( BITFILE *bf, int bit){ 
    if( 0 != bit) bf->rack |= bf->mask; bf->mask >>= 1; if( 0 == bf->mask){ 
    // eight bits in rack fputc( bf->rack, bf->fp); bf->rack = 0; bf->mask = 0x80; } } void BitsOutput( BITFILE *bf, unsigned long code, int count){ 
    unsigned long mask; mask = 1L << (count-1); while( 0 != mask){ 
    BitOutput( bf, (int)(0==(code&mask)?0:1)); mask >>= 1; } } #if 0 int main( int argc, char **argv){ 
    BITFILE *bfi, *bfo; int bit; int count = 0; if( 1<argc){ 
    if( NULL==OpenBitFileInput( bfi, argv[1])){ 
    fprintf( stderr, "fail open the file\n"); return -1; } }else{ 
    if( NULL==OpenBitFileInput( bfi, NULL)){ 
    fprintf( stderr, "fail open stdin\n"); return -2; } } if( 2<argc){ 
    if( NULL==OpenBitFileOutput( bfo, argv[2])){ 
    fprintf( stderr, "fail open file for output\n"); return -3; } }else{ 
    if( NULL==OpenBitFileOutput( bfo, NULL)){ 
    fprintf( stderr, "fail open stdout\n"); return -4; } } while( 1){ 
    bit = BitInput( bfi); fprintf( stderr, "%d", bit); count ++; if( 0==(count&7))fprintf( stderr, " "); BitOutput( bfo, bit); } return 0; } #endif

lzw.c

/* * Definition for LZW coding * * vim: ts=4 sw=4 cindent nowrap */ #include <stdlib.h> #include <stdio.h> #include "bitio.h" #define MAX_CODE 65535 //#define MAX_CODE  struct { 
    int suffix; int parent, firstchild, nextsibling; } dictionary[MAX_CODE+1]; int next_code; int d_stack[MAX_CODE]; // stack for decoding a phrase #define input(f) ((int)BitsInput( f, 16)) #define output(f, x) BitsOutput( f, (unsigned long)(x), 16) int DecodeString( int start, int code); void InitDictionary( void); void PrintDictionary( void){ 
    int n; int count; for( n=256; n<next_code; n++){ 
    count = DecodeString( 0, n); printf( "%4d->", n); while( 0<count--) printf("%c", (char)(d_stack[count])); printf( "\n"); } } int DecodeString( int start, int code){ 
    //需填充 int count; count = start; while (0 <= code)//从最后一个节点开始，向上搜索并存入stack[]中，直到找到最后一个母节点 { 
    d_stack[count] = dictionary[code].suffix; code = dictionary[code].parent; count++;//记录次数 } return count;//count=code对应字符串的长度+start } void InitDictionary( void){ 
   //初始化词典，将0-255根节点初始化 int i; for( i=0; i<256; i++){ 
    dictionary[i].suffix = i;//尾缀字符 dictionary[i].parent = -1;//母节点初始化为空 dictionary[i].firstchild = -1;//子节点初始化为空 dictionary[i].nextsibling = i+1;//下一个兄弟节点 } dictionary[255].nextsibling = -1;//最后一个节点无下一个兄弟 next_code = 256;//新词条编码为256 } /* * Input: string represented by string_code in dictionary, * Output: the index of character+string in the dictionary * index = -1 if not found */ int InDictionary( int character, int string_code){ 
   //查找词典中是否有字符串 int sibling; if( 0>string_code) return character;//string_code==-1,当前词条无前缀，为单个字符，初始化后已经在词典中，返回此字符 sibling = dictionary[string_code].firstchild;//如果不是单个字符，找当前前缀的第一个孩子节点 while( -1<sibling){ 
    if( character == dictionary[sibling].suffix) return sibling;//如果此孩子节点的尾缀字符等于character，则当前词条在词典中，返回此孩子节点 sibling = dictionary[sibling].nextsibling;//否则，找当前前缀的下一个孩子节点 } return -1;//没有找到，返回-1 } void AddToDictionary( int character, int string_code){ 
   //将新串加入词典 int firstsibling, nextsibling; if( 0>string_code) return;//string_code==-1,当前词条无前缀，为单个字符，初始化后已经在词典中 dictionary[next_code].suffix = character;//添加新节点dictionary[next_code]，其尾缀字符为character dictionary[next_code].parent = string_code;//新节点的母节点为string_code dictionary[next_code].nextsibling = -1;//新节点还没有下一个兄弟 dictionary[next_code].firstchild = -1;//新节点还没有孩子 firstsibling = dictionary[string_code].firstchild;//firstsibling为当前前缀的第一个孩子 if( -1<firstsibling){ 
    // 如果当前前缀有孩子 nextsibling = firstsibling; while( -1<dictionary[nextsibling].nextsibling ) //只要nextsibling还有下一个兄弟 nextsibling = dictionary[nextsibling].nextsibling;//让nextsibling替换为它的下一个兄弟，这样可以找到当前前缀的最后一个孩子 dictionary[nextsibling].nextsibling = next_code;//当前前缀的最后一个孩子的下一个兄弟为新节点 }else{ 
   // 当前前缀无孩子,则新节点为它的第一个孩子 dictionary[string_code].firstchild = next_code; } next_code ++;//为下一个新词条所用 } void LZWEncode( FILE *fp, BITFILE *bf){ 
   //LZW编码 int character;//当前字符C int string_code;//当前前缀P int index;//定义索引 unsigned long file_length;//定义文件长度 fseek( fp, 0, SEEK_END);//将指针置于文件末尾 file_length = ftell( fp);//得到当前文件长度 fseek( fp, 0, SEEK_SET);//将指针移回文件开始 BitsOutput( bf, file_length, 4*8); InitDictionary();//初始化词典 string_code = -1;//初始化后的词典为单个字符，无前缀 while( EOF!=(character=fgetc( fp))){ 
   //EOF：END OF FILE，每次从文件中读取一个字符，直到文件的最后一个字符 index = InDictionary( character, string_code);//查找当前字符串是否在词典中，如果不在，则index=-1,如果在，则返回的是当前字符串的后缀（单个字符时为该字符），index>=0 if( 0<=index){ 
    // string+character in dictionary 如果当前字符串在词典中 string_code = index;//将当前字符串初始化为前缀 }else{ 
    // string+character not in dictionary 如果当前字符串不在词典中 output( bf, string_code);//输出当前前缀对应的码字 if( MAX_CODE > next_code){ 
    // free space in dictionary，只要未超过词典容限 // add string+character to dictionary AddToDictionary( character, string_code);//将当前字符串加入到词典中 } string_code = character;//将当前字符C设为前缀P } } output( bf, string_code);//最后一个前缀无下一个字符，将其对应的码字输出 PrintDictionary(); } void LZWDecode( BITFILE *bf, FILE *fp){ 
   //LZW解码 //需填充 int character;//定义当前字符 int new_code, last_code;//定义当前码字和先前码字 int phrase_length;//定义字符串长度 unsigned long file_length;//定义输出文件长度 file_length = BitsInput(bf, 4 * 8);//解码后文件的大小 if (-1 == file_length) { 
    file_length = 0; } InitDictionary();//初始化词典，使在开始译码时词典包含所有可能的前缀根 last_code = -1;//译码前无先前码字 while (file_length > 0)//file_length > 0表示解码未完成，继续解码 { 
    new_code = input(bf);//从输入文件中读出一个码字，为当前码字 if (new_code >= next_code)//如果当前码字不在词典中 { 
    d_stack[0] = character;//先将character赋给d_stack[0] phrase_length = DecodeString(1, last_code);//从d_stack[1]开始存储先前码字对应的字符串， //d_stack[1]存储的是先前码字对应的字符串的最后一个字符，phrase_length为先前码字对应字符串的长度+1 //d_stack[phrase_length-1]存储的是先前码字对应字符串的第一个字符 } else//如果当前码字在词典中 { 
    phrase_length = DecodeString(0, new_code);//从d_stack[0]开始存储当前码字对应的字符串， //d_stack[0]存储的是当前码字对应字符串的最后一个字符,phrase_length为当前码字对应字符串的长度 // d_stack[phrase_length-1]存储的是当前码字对应字符串的第一个字符 } character = d_stack[phrase_length-1]; //当前码字不在词典中时，character为先前码字对应字符串的第一个字符 //当前码字在词典中时，character为当前码字对应字符串的第一个字符 while (0 < phrase_length)//循环，直到d_stack[]全部输出 { 
    phrase_length--; fputc(d_stack[phrase_length],fp);//d_stack[]为倒序存放，这样可以正序输出 //当前码字不在词典中时，将先前码字对应的字符串+先前码字对应字符串的第一个字符输出到字符流中 //当前码字在词典中时，将当前码字对应的字符串输出到字符流中 file_length--; } if (MAX_CODE > next_code)//未超过词典容限时 { 
    AddToDictionary(character, last_code); //当前码字不在词典中时，新词条为当前前缀+当前前缀的第一个字符 //当前码字在词典中时，新词条为当前前缀+当前字符串的第一个字符 } last_code = new_code;//将当前码字设为先前码字 } } int main( int argc, char **argv){ 
    FILE *fp; BITFILE *bf; if( 4>argc){ 
    fprintf( stdout, "usage: \n%s <o> <ifile> <ofile>\n", argv[0]); fprintf( stdout, "\t<o>: E or D reffers encode or decode\n"); fprintf( stdout, "\t<ifile>: input file name\n"); fprintf( stdout, "\t<ofile>: output file name\n"); return -1; } if( 'E' == argv[1][0]){ 
    // do encoding fp = fopen( argv[2], "rb"); bf = OpenBitFileOutput( argv[3]); if( NULL!=fp && NULL!=bf){ 
    LZWEncode( fp, bf); fclose( fp); CloseBitFileOutput( bf); fprintf( stdout, "encoding done\n"); } }else if( 'D' == argv[1][0]){ 
    // do decoding bf = OpenBitFileInput( argv[2]); fp = fopen( argv[3], "wb"); if( NULL!=fp && NULL!=bf){ 
    LZWDecode( bf, fp); fclose( fp); CloseBitFileInput( bf); fprintf( stdout, "decoding done\n"); } }else{ 
    // otherwise fprintf( stderr, "not supported operation\n"); } return 0; }

免责声明：本站所有文章内容,图片，视频等均是来源于用户投稿和互联网及文摘转载整编而成，不代表本站观点，不承担相关法律责任。其著作权各归其原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容,侵犯到您的权益，请在线联系站长,一经查实,本站将立刻删除。本文来自网络,若有侵权，请联系删除，如若转载，请注明出处：https://haidsoft.com/118173.html

【数据压缩】–LZW算法

文章目录

LZW算法原理

LZW编码

编码算法的思想

数据结构分析

主要功能模块

初始化词典

查找词典中是否有字符串

将新串加入词典

编码

LZW解码

解码算法的思想

解码

LZWDecode

DecodeString

调试程序

编码

解码

压缩效率分析

十种不同格式文件的压缩效率

更改词典总数上限后的编码效率

字符重复率很高的文本文档

LZW算法的优缺点

程序代码

发表回复

【数据压缩】–LZW算法

文章目录

LZW算法原理

LZW编码

编码算法的思想

数据结构分析

主要功能模块

初始化词典

查找词典中是否有字符串

将新串加入词典

编码

LZW解码

解码算法的思想

解码

LZWDecode

DecodeString

调试程序

编码

解码

压缩效率分析

十种不同格式文件的压缩效率

更改词典总数上限后的编码效率

字符重复率很高的文本文档

LZW算法的优缺点

程序代码

相关推荐

发表回复