Source code for tcmpr.algorithms.lzw.compressor

"""
This module is responsible for implementing
LZW CODING algorithm which can be used to
compressing input file.
"""
import os


[docs]def compress_lzw(input_file): """ Function compress input file and create new compressed file with extension ".lzw" (for decompression purposes) in place of use. Algorithm create dictionary of encoding base on input file and frequent subsequence of chars. :param input_file: :return: compressed file path """ output_file_dir = os.path.dirname(input_file) output_filename = input_file + ".lzw" output_file_path = os.path.join(output_file_dir, output_filename) try: with open(input_file, "r", encoding='utf-8') as file: data = file.read() except UnicodeDecodeError: with open(input_file, "r", encoding='ISO-8859-1') as file: data = file.read() encoded_input_data = encode_data(data) # for decompression purpose save encoded data to file # with 2 bytes for each code with open(output_file_path, 'wb') as output: chunk_size = 2 if max(encoded_input_data) > 2**15: chunk_size = 4 output.write((4).to_bytes(2, byteorder='big')) else: output.write((2).to_bytes(2, byteorder='big')) for code in encoded_input_data: output.write(code.to_bytes(chunk_size, byteorder='big')) return output_file_path
[docs]def encode_data(data): """ Create dictionary base on input data and encode data and look for frequent subsequences and assign them to proper codes in dictionary. :param data: :return: """ dictionary = {chr(c): c for c in range(0, 256)} chars = dictionary.keys() max_code = 255 index = 0 data_length = len(data) codes = [] z = data[index] while index < data_length -1: k = data[index + 1] if z + k in chars: z = z + k index += 1 else: codes.append(dictionary[z]) max_code += 1 dictionary[z + k] = max_code z = k index += 1 codes.append(dictionary[z]) return codes