Huffman complete v0.1

This commit is contained in:
helldh 2026-01-18 17:09:28 +03:00
parent c1ca54db6b
commit 458201e00b
5 changed files with 130 additions and 15 deletions

6
.gitignore vendored
View file

@ -1,2 +1,4 @@
bigfile.pdf
.venv
.venv
useless.huff
useless.map
useless_decoded.map

6
.idea/vcs.xml generated Normal file
View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

Binary file not shown.

View file

@ -1,22 +1,108 @@
import heapq as hq
from typing import BinaryIO, Dict
from collections import Counter
from typing import BinaryIO, List, Tuple
from dataclasses import dataclass
def count_frequencies(fd: BinaryIO) -> Dict[int, int]:
entropy = list()
# structure implementation in Python
@dataclass
class HTLS: # short alias of Huffman Tree List Structure
left: int | None
right: int | None
value: Tuple[int | None, int]
# Step 1: Count frequencies
def get_frequencies(fd: BinaryIO) -> List[int]:
    """Count how often every byte value occurs in a binary stream.

    The stream is consumed from its current position to EOF in fixed-size
    chunks, so the whole file never has to sit in memory at once.

    Args:
        fd: Binary file-like object opened for reading.

    Returns:
        A list of 256 counters; element ``b`` is the number of times the
        byte value ``b`` was read.
    """
    frequencies = [0] * 256
    while chunk := fd.read(65536):
        # Iterating a bytes object yields ints in range(256), which index
        # the table directly.
        for byte in chunk:
            frequencies[byte] += 1
    return frequencies
def make_tree(frequencies_table: List[int]) -> Tuple[List[HTLS], int]:
    """Build a Huffman tree stored as a flat list of ``HTLS`` nodes.

    One leaf is created per entry of ``frequencies_table`` (including entries
    whose count is zero), so leaf ``i`` always sits at index ``i``. Merged
    nodes are appended after the leaves and refer to their children by index.

    Args:
        frequencies_table: Occurrence count per symbol; the list index is the
            symbol value.

    Returns:
        ``(nodes, root_index)`` where ``nodes`` is the complete node list and
        ``root_index`` is the position of the tree root inside it.

    Raises:
        ValueError: If ``frequencies_table`` is empty.
    """
    # Imported here so the function does not rely on the module's import list.
    import heapq

    nodes = [HTLS(left=None, right=None, value=(symbol, freq))
             for symbol, freq in enumerate(frequencies_table)]
    if not nodes:
        raise ValueError("frequencies_table must contain at least one symbol")

    # Heap entries are (frequency, node index). Breaking ties on the index
    # reproduces what a stable sort by frequency would pick, because merged
    # nodes always receive a larger index than every node already present.
    heap = [(node.value[1], idx) for idx, node in enumerate(nodes)]
    heapq.heapify(heap)

    while len(heap) > 1:
        freq1, i1 = heapq.heappop(heap)
        freq2, i2 = heapq.heappop(heap)
        nodes.append(HTLS(left=i1, right=i2, value=(None, freq1 + freq2)))
        heapq.heappush(heap, (freq1 + freq2, len(nodes) - 1))

    return nodes, heap[0][1]
def make_codes(nodes: List[HTLS], root: int):
    """Derive the bit code of every leaf symbol in a list-stored Huffman tree.

    The tree is walked iteratively from ``root``; going to a left child
    appends a 0 bit, going to a right child appends a 1 bit.

    Args:
        nodes: Node list in which children are referenced by index.
        root: Index of the root node inside ``nodes``.

    Returns:
        Dict mapping each leaf symbol to ``(code, length)``, where ``code``
        holds the bits as an integer and ``length`` is the number of bits.
    """
    table = {}
    pending = [(root, 0, 0)]
    while pending:
        node_idx, bits, n_bits = pending.pop()
        current = nodes[node_idx]
        symbol = current.value[0]
        if symbol is not None:
            # Leaf reached: the accumulated path is this symbol's code.
            table[symbol] = (bits, n_bits)
            continue
        # Push the right child first so the left subtree is popped (and
        # therefore visited) first.
        for child, bit in ((current.right, 1), (current.left, 0)):
            if child is not None:
                pending.append((child, (bits << 1) | bit, n_bits + 1))
    return table
def encode_flow(input_fd, output_fd, codes):
    """Write the Huffman-encoded form of ``input_fd`` to ``output_fd``.

    The input is rewound to its start and read in chunks. Codes are packed
    most-significant-bit first; if the final byte is incomplete it is padded
    on the right with zero bits.

    Args:
        input_fd: Seekable binary stream with the data to encode.
        output_fd: Binary stream that receives the packed bits.
        codes: Mapping ``byte -> (code, length)`` with the code bits held in
            an integer and ``length`` giving the number of bits.

    Raises:
        KeyError: If the input contains a byte that has no entry in ``codes``.
    """
    buffer = 0      # bits not yet written; always fewer than 8 between symbols
    buffer_len = 0  # number of valid bits currently held in ``buffer``

    input_fd.seek(0)
    while chunk := input_fd.read(65536):
        packed = bytearray()
        for byte in chunk:
            code, length = codes[byte]
            buffer = (buffer << length) | code
            buffer_len += length
            while buffer_len >= 8:
                buffer_len -= 8
                packed.append((buffer >> buffer_len) & 0xFF)
                # Drop the bits just emitted so the accumulator stays small
                # instead of growing with the size of the input.
                buffer &= (1 << buffer_len) - 1
        if packed:
            output_fd.write(packed)

    if buffer_len > 0:
        # Left-align the leftover bits inside the final byte.
        output_fd.write(bytes([(buffer << (8 - buffer_len)) & 0xFF]))
def decode_flow(input_fd, output_fd, nodes, root_idx, total_bytes):
    """Decode a Huffman bit stream back into the original bytes.

    The input is rewound to its start and consumed most-significant-bit
    first: a 1 bit moves to the right child, a 0 bit to the left child.
    Decoding stops as soon as ``total_bytes`` symbols have been written, so
    padding bits in the last input byte are ignored. If the input ends
    early, fewer than ``total_bytes`` bytes are written. The read position
    of ``input_fd`` after the call is unspecified because of chunked reads.

    Args:
        input_fd: Seekable binary stream holding the encoded data.
        output_fd: Binary stream that receives the decoded bytes.
        nodes: Node list of the Huffman tree (children referenced by index).
        root_idx: Index of the tree root inside ``nodes``.
        total_bytes: Number of symbols to decode (length of the original data).
    """
    input_fd.seek(0)

    # Nothing was encoded, so nothing must be emitted even if input has bytes.
    if total_bytes <= 0:
        return

    root_symbol = nodes[root_idx].value[0]
    if root_symbol is not None:
        # Single-leaf tree: there are no children to follow and the symbol
        # carries no bits, so the output is that symbol repeated.
        output_fd.write(bytes([root_symbol]) * total_bytes)
        return

    current_node = root_idx
    remaining = total_bytes
    while chunk := input_fd.read(65536):
        decoded = bytearray()
        for b in chunk:
            for shift in range(7, -1, -1):
                node = nodes[current_node]
                current_node = node.right if (b >> shift) & 1 else node.left
                symbol = nodes[current_node].value[0]
                if symbol is None:
                    continue
                decoded.append(symbol)
                remaining -= 1
                if remaining == 0:
                    output_fd.write(decoded)
                    return
                # Leaf consumed: restart from the root for the next symbol.
                current_node = root_idx
        output_fd.write(decoded)

23
main.py
View file

@ -1 +1,22 @@
# placeholder
from huffman import (
get_frequencies,
make_tree,
make_codes,
encode_flow,
decode_flow
)
# Pass 1: collect byte statistics and the size of the source file.
with open("useless.map", "rb") as f_in:
    frequencies = get_frequencies(f_in)
    # Seeking to the end returns the new absolute position,
    # i.e. the length of the source file.
    total_bytes = f_in.seek(0, 2)
    f_in.seek(0)

# Build the tree and the per-byte code table from the statistics.
nodes, root_idx = make_tree(frequencies)
codes = make_codes(nodes, root_idx)

# Pass 2: compress the source file.
with open("useless.map", "rb") as f_in:
    with open("useless.huff", "wb") as f_out:
        encode_flow(f_in, f_out, codes)

# Pass 3: decompress again, reusing the in-memory tree.
with open("useless.huff", "rb") as f_in:
    with open("useless_decoded.map", "wb") as f_out:
        decode_flow(f_in, f_out, nodes, root_idx, total_bytes)