Huffman complete v0.1
This commit is contained in:
parent
c1ca54db6b
commit
458201e00b
5 changed files with 130 additions and 15 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -1,2 +1,4 @@
|
|||
bigfile.pdf
|
||||
.venv
|
||||
useless.huff
|
||||
useless.map
|
||||
useless_decoded.map
|
||||
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
||||
BIN
__pycache__/huffman.cpython-313.pyc
Normal file
BIN
__pycache__/huffman.cpython-313.pyc
Normal file
Binary file not shown.
110
huffman.py
110
huffman.py
|
|
@ -1,22 +1,108 @@
|
|||
import heapq as hq
|
||||
from typing import BinaryIO, Dict
|
||||
from collections import Counter
|
||||
from typing import BinaryIO, List, Tuple
|
||||
from dataclasses import dataclass
|
||||
|
||||
def count_frequencies(fd: BinaryIO) -> Dict[int, int]:
|
||||
entropy = list()
|
||||
# structure implementation in Python
@dataclass
class HTLS:  # short alias of Huffman Tree List Structure
    """One node of a Huffman tree stored as a flat list of nodes.

    Children are referenced by their integer position in the node list
    rather than by object reference.
    """

    # Index of the left child in the node list, or None when there is none.
    left: int | None
    # Index of the right child in the node list, or None when there is none.
    right: int | None
    # (byte value, frequency); the byte value is None for merged (internal)
    # nodes, whose frequency is the sum of their two children.
    value: Tuple[int | None, int]
|
||||
|
||||
# Step 1: Count frequencies
|
||||
def get_frequencies(fd: BinaryIO) -> List[int]:
    """Count how often each byte value occurs in a binary stream.

    The stream is consumed from its current position to the end, in
    fixed-size chunks so that large files are never held in memory whole.

    Args:
        fd: A binary file-like object opened for reading.

    Returns:
        A list of 256 integers where element ``b`` is the number of
        occurrences of byte value ``b``. All zeros for an empty stream.
    """
    frequencies = [0] * 256

    # Iterating a bytes object yields ints in 0..255, which index the table.
    while chunk := fd.read(64 * 1024):
        for byte in chunk:
            frequencies[byte] += 1

    return frequencies
|
||||
|
||||
def make_tree(frequencies_table: List[int]) -> "Tuple[List[HTLS], int]":
    """Build a Huffman tree from a table of symbol frequencies.

    Every entry of the table becomes a leaf (zero-frequency entries
    included), so leaf ``i`` always sits at position ``i`` of the returned
    node list. Internal nodes are appended after the leaves.

    Args:
        frequencies_table: ``frequencies_table[i]`` is the frequency of
            symbol ``i``.

    Returns:
        A ``(nodes, root_index)`` pair: the flat node list and the position
        of the root node inside it.

    Raises:
        IndexError: If ``frequencies_table`` is empty (there is no root).
    """
    import heapq  # local import keeps this block self-contained

    nodes = [HTLS(left=None, right=None, value=(symbol, freq))
             for symbol, freq in enumerate(frequencies_table)]
    if not nodes:
        raise IndexError("frequencies_table must contain at least one entry")

    # Heap entries are (frequency, node index). Ordering by index on equal
    # frequency reproduces what a stable sort of the live nodes by frequency
    # would pick, because merged nodes always receive the largest index.
    heap = [(node.value[1], idx) for idx, node in enumerate(nodes)]
    heapq.heapify(heap)

    while len(heap) > 1:
        freq1, i1 = heapq.heappop(heap)
        freq2, i2 = heapq.heappop(heap)

        # The first (smaller) node becomes the left child.
        nodes.append(HTLS(left=i1, right=i2, value=(None, freq1 + freq2)))
        heapq.heappush(heap, (freq1 + freq2, len(nodes) - 1))

    return nodes, heap[0][1]
|
||||
|
||||
def make_codes(nodes: "List[HTLS]", root: int):
    """Derive the bit code of every leaf reachable from ``root``.

    The tree is walked depth-first without recursion. Going to a left child
    appends a 0 bit, going to a right child appends a 1 bit.

    Args:
        nodes: Flat node list; children are referenced by list index.
        root: Index of the root node in ``nodes``.

    Returns:
        A dict mapping each leaf's byte value to ``(code, length)``, where
        ``code`` holds the bits as an integer and ``length`` is the number
        of bits (needed because leading zero bits are not visible in the int).
    """
    table = {}
    pending = [(root, 0, 0)]

    while pending:
        node_idx, bits, depth = pending.pop()
        current = nodes[node_idx]
        symbol = current.value[0]

        if symbol is not None:
            # Leaf: the path taken so far is this symbol's code.
            table[symbol] = (bits, depth)
            continue

        # Push right before left so the left subtree is processed first.
        for child, bit in ((current.right, 1), (current.left, 0)):
            if child is not None:
                pending.append((child, (bits << 1) | bit, depth + 1))

    return table
|
||||
|
||||
def encode_flow(input_fd, output_fd, codes):
    """Write the Huffman-encoded form of ``input_fd`` to ``output_fd``.

    The input is rewound to its start and read in chunks. Codes are packed
    most-significant-bit first; the final partial byte, if any, is padded
    with zero bits on the right.

    Args:
        input_fd: Seekable binary stream to encode.
        output_fd: Binary stream that receives the packed bits.
        codes: Mapping of byte value to ``(code, length)``, where ``code`` is
            the bit pattern as an int and ``length`` its size in bits.

    Raises:
        KeyError: If the input contains a byte that has no entry in ``codes``.
    """
    chunk_size = 64 * 1024
    buffer = 0      # pending bits that do not yet fill a whole byte
    buffer_len = 0  # number of valid bits in ``buffer``

    input_fd.seek(0)

    while chunk := input_fd.read(chunk_size):
        packed = bytearray()

        for byte in chunk:
            code, length = codes[byte]

            buffer = (buffer << length) | code
            buffer_len += length

            while buffer_len >= 8:
                buffer_len -= 8
                packed.append((buffer >> buffer_len) & 0xFF)

            # Drop the bits already emitted; without this the integer keeps
            # growing with the input and every shift gets slower.
            buffer &= (1 << buffer_len) - 1

        if packed:
            output_fd.write(packed)

    if buffer_len > 0:
        # Left-align the leftover bits in the last byte (zero padding).
        output_fd.write(bytes([(buffer << (8 - buffer_len)) & 0xFF]))
|
||||
|
||||
def decode_flow(input_fd, output_fd, nodes, root_idx, total_bytes):
    """Decode a Huffman bit stream back into the original bytes.

    The input is rewound to its start and consumed most-significant-bit
    first. A 0 bit follows the left child, a 1 bit the right child; reaching
    a leaf emits its byte and restarts from the root. Decoding stops after
    ``total_bytes`` bytes so that padding bits in the last input byte are
    not turned into spurious symbols.

    Args:
        input_fd: Seekable binary stream holding the encoded data.
        output_fd: Binary stream that receives the decoded bytes.
        nodes: Flat node list; each node exposes ``left``, ``right`` and
            ``value`` (``value[0]`` is the byte, or None for internal nodes).
        root_idx: Index of the root node in ``nodes``.
        total_bytes: Number of bytes the original data contained.
    """
    input_fd.seek(0)

    # Nothing to produce: do not let stray input bits emit a symbol.
    if total_bytes <= 0:
        return

    root_symbol = nodes[root_idx].value[0]
    if root_symbol is not None:
        # The root is itself a leaf, so there are no branches to follow and
        # every output byte is this one symbol.
        output_fd.write(bytes([root_symbol]) * total_bytes)
        return

    current_node = root_idx
    bytes_written = 0

    while chunk := input_fd.read(64 * 1024):
        decoded = bytearray()

        for b in chunk:
            for shift in range(7, -1, -1):
                node = nodes[current_node]
                current_node = node.right if (b >> shift) & 1 else node.left

                symbol = nodes[current_node].value[0]
                if symbol is not None:
                    decoded.append(symbol)
                    bytes_written += 1
                    if bytes_written >= total_bytes:
                        # Remaining bits are padding; flush and stop.
                        output_fd.write(decoded)
                        return
                    current_node = root_idx

        if decoded:
            output_fd.write(decoded)
|
||||
23
main.py
23
main.py
|
|
@ -1 +1,22 @@
|
|||
# Round-trip demo: compress useless.map into useless.huff, then decode it
# back into useless_decoded.map.
from huffman import (
    decode_flow,
    encode_flow,
    get_frequencies,
    make_codes,
    make_tree,
)

with open("useless.map", "rb") as f_in:
    frequencies = get_frequencies(f_in)
    # seek(0, 2) moves to the end of the file and returns that offset,
    # i.e. the length of the source file.
    total_bytes = f_in.seek(0, 2)
    f_in.seek(0)

nodes, root_idx = make_tree(frequencies)
codes = make_codes(nodes, root_idx)

with open("useless.map", "rb") as f_in, open("useless.huff", "wb") as f_out:
    encode_flow(f_in, f_out, codes)

with open("useless.huff", "rb") as f_in, open("useless_decoded.map", "wb") as f_out:
    decode_flow(f_in, f_out, nodes, root_idx, total_bytes)
|
||||
Loading…
Add table
Add a link
Reference in a new issue