# Source code for tcmpr.algorithms.lzw.compressor

"""
This module is responsible for implementing
LZW CODING algorithm which can be used to
compressing input file.
"""
import os


def compress_lzw(input_file):
    """
    Compress a text file with LZW and write the result next to it.

    The input is read as text (UTF-8 first, ISO-8859-1 as a fallback when
    UTF-8 decoding fails), encoded with ``encode_data`` and stored in a new
    file whose name is the input path with ".lzw" appended.

    Output layout (all integers big-endian, unsigned): a 2-byte header
    holding the width of every code in bytes (2 or 4), followed by each
    code written with that width. An empty input file produces a file
    that contains only the header.

    :param input_file: path of the file to compress (``str`` or
        ``os.PathLike``)
    :return: compressed file path
    :raises KeyError: propagated from ``encode_data`` when the decoded text
        contains a character outside code points 0-255
    """
    input_path = os.fspath(input_file)
    # Append the extension to the path exactly as given. Joining the
    # directory name with the full input path again would duplicate the
    # directory for relative paths such as "data/file.txt".
    output_file_path = input_path + ".lzw"

    try:
        with open(input_path, "r", encoding='utf-8') as file:
            data = file.read()
    except UnicodeDecodeError:
        # ISO-8859-1 maps every byte to a code point, so this cannot fail.
        with open(input_path, "r", encoding='ISO-8859-1') as file:
            data = file.read()

    # An empty file has nothing to encode; skip the encoder so that no
    # lookup is attempted on an empty string.
    encoded_input_data = encode_data(data) if data else []

    # For decompression purposes the code width is stored in the header.
    # The switch to 4-byte codes happens above 2**15 (not 2**16 - 1); the
    # threshold is kept unchanged so the written format stays the same.
    chunk_size = 4 if max(encoded_input_data, default=0) > 2**15 else 2

    with open(output_file_path, 'wb') as output:
        output.write(chunk_size.to_bytes(2, byteorder='big'))
        output.write(b"".join(
            code.to_bytes(chunk_size, byteorder='big')
            for code in encoded_input_data
        ))

    return output_file_path


def encode_data(data):
    """
    Encode text with the LZW algorithm.

    The dictionary starts with one entry per single character with code
    point 0-255 (code equal to the code point). While scanning the input,
    every new subsequence (longest known prefix plus the next character)
    is added to the dictionary under the next free code, starting at 256.

    :param data: text to encode
    :return: list of integer codes; empty list for empty input
    :raises KeyError: if ``data`` contains a character whose code point
        is above 255, because it is missing from the initial dictionary
    """
    if not data:
        return []

    dictionary = {chr(c): c for c in range(256)}
    next_code = 256
    codes = []

    symbols = iter(data)
    # Longest subsequence matched so far that is known to the dictionary.
    current = next(symbols)
    for symbol in symbols:
        candidate = current + symbol
        if candidate in dictionary:
            current = candidate
        else:
            codes.append(dictionary[current])
            dictionary[candidate] = next_code
            next_code += 1
            current = symbol
    # Flush the subsequence that was still being matched at end of input.
    codes.append(dictionary[current])

    return codes