diff --git a/.gitignore b/.gitignore index 5a8510a..d05845d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ -bigfile.pdf -.venv \ No newline at end of file +.venv +useless.huff +useless.map +useless_decoded.map \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/__pycache__/huffman.cpython-313.pyc b/__pycache__/huffman.cpython-313.pyc new file mode 100644 index 0000000..d98cf08 Binary files /dev/null and b/__pycache__/huffman.cpython-313.pyc differ diff --git a/huffman.py b/huffman.py index d25636a..13714ee 100644 --- a/huffman.py +++ b/huffman.py @@ -1,22 +1,108 @@ -import heapq as hq -from typing import BinaryIO, Dict -from collections import Counter +from typing import BinaryIO, List, Tuple +from dataclasses import dataclass -def count_frequencies(fd: BinaryIO) -> Dict[int, int]: - entropy = list() +# structure implementation in Python +@dataclass +class HTLS: # short alias of Huffman Tree List Structure + left: int | None + right: int | None + value: Tuple[int | None, int] +# Step 1: Count frequencies +def get_frequencies(fd: BinaryIO): + frequencies = [0] * 256 content = fd.read() for byte in content: - entropy.append(byte) + frequencies[byte] += 1 - byte_counter = Counter(entropy) + return frequencies - return dict(byte_counter) -def make_tree(frequencies_table: Dict[int, int]): - ft = frequencies_table # short alias +def make_tree(frequencies_table: List[int]): + nodes = [HTLS(left=None, right=None, value=(i, freq)) + for i, freq in enumerate(frequencies_table)] - huffman_tree = hq.heapify(list()) + alive = list(range(len(nodes))) - # TODO: Реализовать логику дерева хаффмана, пока что остановимся на этом \ No newline at end of file + while len(alive) > 1: + alive.sort(key=lambda idx: nodes[idx].value[1]) + + i1 = alive.pop(0) + i2 = alive.pop(0) + freq1 = nodes[i1].value[1] + freq2 = nodes[i2].value[1] + + new_node = HTLS( + left=i1, + right=i2, + value=(None, freq1 + freq2) + ) + nodes.append(new_node) + alive.append(len(nodes) - 1) + + return nodes, alive[0] + +def make_codes(nodes: List[HTLS], root: int): + codes = {} + stack = [(root, 0, 0)] + + while stack: + idx, code, length = stack.pop() + node = nodes[idx] + + if node.value[0] is not None: + byte = node.value[0] + codes[byte] = (code, length) + else: + if node.right is not None: + stack.append((node.right, (code << 1) | 1, length + 1)) + if node.left is not None: + stack.append((node.left, (code << 1) | 0, length + 1)) + + return codes + +def encode_flow(input_fd, output_fd, codes): + buffer = 0 + buffer_len = 0 + + input_fd.seek(0) + content = input_fd.read() + + for byte in content: + code, length = codes[byte] + + buffer = (buffer << length) | code + buffer_len += length + + while buffer_len >= 8: + buffer_len -= 8 + to_write = (buffer >> buffer_len) & 0xFF + output_fd.write(bytes([to_write])) + + if buffer_len > 0: + to_write = (buffer << (8 - buffer_len)) & 0xFF + output_fd.write(bytes([to_write])) + +def decode_flow(input_fd, output_fd, nodes, root_idx, total_bytes): + input_fd.seek(0) + current_node = root_idx + bytes_written = 0 + + while True: + byte_s = input_fd.read(1) + if not byte_s: + break + b = byte_s[0] + + for i in reversed(range(8)): + bit = (b >> i) & 1 + node = nodes[current_node] + current_node = node.right if bit else node.left + + if nodes[current_node].value[0] is not None: + output_fd.write(bytes([nodes[current_node].value[0]])) + bytes_written += 1 + if bytes_written >= total_bytes: + return + current_node = root_idx \ No newline at end of file diff --git a/main.py b/main.py index ea6a1d1..8514cd7 100644 --- a/main.py +++ b/main.py @@ -1 +1,22 @@ -# заглушка \ No newline at end of file +from huffman import ( + get_frequencies, + make_tree, + make_codes, + encode_flow, + decode_flow +) + +with open("useless.map", "rb") as f_in: + frequencies = get_frequencies(f_in) + total_bytes = f_in.seek(0, 2) # длина исходного файла + f_in.seek(0) + +nodes, root_idx = make_tree(frequencies) + +codes = make_codes(nodes, root_idx) + +with open("useless.map", "rb") as f_in, open("useless.huff", "wb") as f_out: + encode_flow(f_in, f_out, codes) + +with open("useless.huff", "rb") as f_in, open("useless_decoded.map", "wb") as f_out: + decode_flow(f_in, f_out, nodes, root_idx, total_bytes) \ No newline at end of file