From 458201e00bc14c5802d4128d3b16226a865314ef Mon Sep 17 00:00:00 2001 From: helldh Date: Sun, 18 Jan 2026 17:09:28 +0300 Subject: [PATCH] Huffman complete v0.1 --- .gitignore | 6 +- .idea/vcs.xml | 6 ++ __pycache__/huffman.cpython-313.pyc | Bin 0 -> 4643 bytes huffman.py | 110 +++++++++++++++++++++++++--- main.py | 23 +++++- 5 files changed, 130 insertions(+), 15 deletions(-) create mode 100644 .idea/vcs.xml create mode 100644 __pycache__/huffman.cpython-313.pyc diff --git a/.gitignore b/.gitignore index 5a8510a..d05845d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ -bigfile.pdf -.venv \ No newline at end of file +.venv +useless.huff +useless.map +useless_decoded.map \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/__pycache__/huffman.cpython-313.pyc b/__pycache__/huffman.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d98cf08f922b4856e567977542c9070b33fb024a GIT binary patch literal 4643 zcmbtY-A@}=7Qgdhd;EnlHsK?XK!AX00!{0DSxs8DAx%g~QYP-w4XQQR12HjUxMQ~^ zl`60Hq4q`<_BK(RZKYCarA;1IqEDNy_MzIp057zijHNyWI`~rGrbv`<;Y*i=9fbY&O=j(6~cnB6AnW5J$Nop7KKi z720r&zaS1-sHKgJSsKX!A`8t#7LW4fcaLtWpjO#3#szuV+DxWIYLjha{FsCbz?j6w z?6MuktT-k+#-yMqJK4BHb~O{~l-*FfCTjwo$EY(Pn9|SUnnLH!UpB>yaXn>PuBK-a zYJfBCF(su$6N;`!QM6Ko&Bpo_Xxt%VgmN;Wyv$Ldi;Rgff7nLg5VFt>^V|9&>szZM zVvxs8f}{m!WGl5z@&TJEp1XQcevDI>OfjKOz>a8q@;W@;R1#_R0SQQ^6p3icv>J(+ zj!0xW8A~Uy?TSQZ(@J7<#uJH5#HpT2#5Gk*!n6?AQYMTlnwCr{sd!S;BN2)Vne~x~ zjziIil1kC|csixxoEzrW;YZB={_DwUwLhk4aW&C@WiEPMq0?7ra!QS+^#1GViHT`N zt33N==BNa3u?qF;0jS<1s}6Tw%=Ipde#2Rl_vb>(V!h$4&HM7HLUU2h?=LyqzZBbP zedS$~pNK^{=v4|TL;J6fK?P9a`Z=&SjA~q%jBJ^Qe~Q^& z77oJ^bwTIx28O+aI7u4`bDbn0UZRM-#{`r>Vlts0;&u#eExh#Q78m57mS+P`N`bm zt$5~@Z)+Q}q9OT~rKaMmC28LmQqvQGcpKNO#MQ7Q?qOCkz1#FgQp$Kjeac=)N@lM{ zILsUNJLFo`MqzSvv+X0Vaj)^Z&`)%<5m+G&(ld3}Do96qlF>S#7*!Vmk(cG3ln z*SKl>HM|v8uGy+Bd!=>NURh`7QxmRLrNTmGR^~&u2~mDeNLUyFSF*E&?LiKm!;~b8 zCpow+VFY2iqsW$uK27{d3CmS`Sj0n&)a+0LU6~<8vx1D}CuD3}77L4E3)pP(FlZ>- zO_Q$LN5Hl_OIWJxl$~W4i7hK_*%D9>cf0lGmdp<<33oXR)`a~FGCM(#?}VFf1Ki{N zO^|@~Qb3>x6^izz2u<1y)l*PHHo(P`^&Y5Bb9cy9pd2#@^mi7S!x^9ve~D=da7=4} z2PYJJ+q7uOn5yfjvMp04iS$B3nI4ZRr_H_7$_+J=qN>{WLLwPe68dRqlqXx^4c!mb zKQmIPlkwV~i0%Q{jd>$H*J7n|4)8Ppec(smvLHLuD?WD%zxK z3JFy+MLkJVrZAJ7VOun<%FK+a#R3l62J4ty+!T>xrbS7_-&9RLeguBcnHC&8Qhq#V zN}Bps1P?_!affJ2$Xbjaj6Cu+ z7+#-I=Qn%}#rk`#OLg5(Y@*+hmDU`@UNi5?x$>hc(mq@i+PCFm{Rh8TXfM36B6V)G z5@$_$SMT?&wGo#$-&N!aJ=qtR#O8mkKLe-H5hwY^Ggd3J9JttIG8IK&REiZE%!R!R z?&=PCmVhhgWcUjT{hTI*1;is3t(bt0fXb%RC%hy2?eg2Lz4wK+~Y&(XbySqHN4E`|ev7ADLt?&fdVAVCY z;d>ZZca$GpXBYYEIvl1dyRNfkRj|-7S{9r}Kqv>rGNmmPcUSfn%y0Bz7oi`pv&*Cy z-b_mrxo%n@zlh$T9XKT7kZA?>O{T7!BIC_In9wmCY_g^jJROWSOb2}trg0JdPf$VJ zbk$@A4Is%tel|Ony_6aJhr?sEw10g5-uVwNl%%GtkUwU4{JEQ1an;x>|4HZH~rEn^H#@M^R*q0sr^@UZBZ~kWPX35h9G`Q<3xQellr|wPN4=x^gaPlGd zp#R~q&j&sWEw!FqseQ!&;kn#g$*zsb8Vl@ikDfr3g>88IkPe;0BnVc6)HU>HLSlI1EijL%ETlS`I2t?*Hv`EM>`t?X zxe-*gqilu#u)V*!!)-g=)UKfRe)tVcj4s?>CppAmpA z#aKiUw)6$8PNAHXN=BH{r$fMIBItx{5s)95NF?8)a4Td0e)>bG0L<>1%vr-*ml;|W zUGEM5ayZ|<0(o-Hp84tA^sQtiA8cqTypwYQp6eP*bq5w=C2w!mYDje@sU_=Jt!XR{ zyq`2&dy4J3%h@5LabKzN(4tss2xiY2uKM}Qxyv67E}U5CU2ZwN;_6$o5l>TbU_SZ8 zjfv$RA_13v^{(M>F3v85?!8;`_bqmoe8;kqA$jLLInPIy;@ick<;K7lQtz6DNaZ9n zx*ZPa>=2Ucc?RYG0m6c20kB*5k=?;8^A!-G!>%~{7-*ja2$!9;CheAk%2mf%(Ewza zLkN%N7@&CuXvb(59zb)39Tn{gq(EWkZm#yS6UNcU?I@b1tYrj~ryZrdM+|DiaN$57 z_zKxsu2u8CpVG4{xiC~9iLIkCq%e}K4ebNA9e$@wwBb(?{@WZlh0UO=eiD5ZqrEuBoI0Mh?Bma1YY?h{i=M^yD_C_xWl9*eBbeDaQ}Ah-h%iQ(u4tN4_$p$V zk?CfGj2`f*1eAg=rHF2F;}qNhF@^DXik^m+jwY-|0ce!-CudCEB2O4ZevYz#feOg8 zoie_?nTv+MJ~O=Ps4=`vcUuds#o1+Vd)E4mr`~XR4Odx2q(X`{kvBXO`R0`x~{b3tmXC@@~V|w(RTw z+TU#K@BWSJNj<6S1nN0GEFxSAxJ_$nZYHixQe+Snu|jB?&gy?6RGnh5V96}|j-++i z#f*}LN9I;2numS7GsTqzyZnEG({vPOfj&f!LA54u9QSw9QzAWoBkk~U$veO21@6Ea zfoffV?i%a5Y+OT8+92>)^Y7vIF8CLspEPX{Xs$i$<-GY=F<6*dBhb7rY~bU&gMRLr M^+WYs58K6m0SiKv;{X5v literal 0 HcmV?d00001 diff --git a/huffman.py b/huffman.py index d25636a..13714ee 100644 --- a/huffman.py +++ b/huffman.py @@ -1,22 +1,108 @@ -import heapq as hq -from typing import BinaryIO, Dict -from collections import Counter +from typing import BinaryIO, List, Tuple +from dataclasses import dataclass -def count_frequencies(fd: BinaryIO) -> Dict[int, int]: - entropy = list() +# structure implementation in Python +@dataclass +class HTLS: # short alias of Huffman Tree List Structure + left: int | None + right: int | None + value: Tuple[int | None, int] +# Step 1: Count frequencies +def get_frequencies(fd: BinaryIO): + frequencies = [0] * 256 content = fd.read() for byte in content: - entropy.append(byte) + frequencies[byte] += 1 - byte_counter = Counter(entropy) + return frequencies - return dict(byte_counter) -def make_tree(frequencies_table: Dict[int, int]): - ft = frequencies_table # short alias +def make_tree(frequencies_table: List[int]): + nodes = [HTLS(left=None, right=None, value=(i, freq)) + for i, freq in enumerate(frequencies_table)] - huffman_tree = hq.heapify(list()) + alive = list(range(len(nodes))) - # TODO: Реализовать логику дерева хаффмана, пока что остановимся на этом \ No newline at end of file + while len(alive) > 1: + alive.sort(key=lambda idx: nodes[idx].value[1]) + + i1 = alive.pop(0) + i2 = alive.pop(0) + freq1 = nodes[i1].value[1] + freq2 = nodes[i2].value[1] + + new_node = HTLS( + left=i1, + right=i2, + value=(None, freq1 + freq2) + ) + nodes.append(new_node) + alive.append(len(nodes) - 1) + + return nodes, alive[0] + +def make_codes(nodes: List[HTLS], root: int): + codes = {} + stack = [(root, 0, 0)] + + while stack: + idx, code, length = stack.pop() + node = nodes[idx] + + if node.value[0] is not None: + byte = node.value[0] + codes[byte] = (code, length) + else: + if node.right is not None: + stack.append((node.right, (code << 1) | 1, length + 1)) + if node.left is not None: + stack.append((node.left, (code << 1) | 0, length + 1)) + + return codes + +def encode_flow(input_fd, output_fd, codes): + buffer = 0 + buffer_len = 0 + + input_fd.seek(0) + content = input_fd.read() + + for byte in content: + code, length = codes[byte] + + buffer = (buffer << length) | code + buffer_len += length + + while buffer_len >= 8: + buffer_len -= 8 + to_write = (buffer >> buffer_len) & 0xFF + output_fd.write(bytes([to_write])) + + if buffer_len > 0: + to_write = (buffer << (8 - buffer_len)) & 0xFF + output_fd.write(bytes([to_write])) + +def decode_flow(input_fd, output_fd, nodes, root_idx, total_bytes): + input_fd.seek(0) + current_node = root_idx + bytes_written = 0 + + while True: + byte_s = input_fd.read(1) + if not byte_s: + break + b = byte_s[0] + + for i in reversed(range(8)): + bit = (b >> i) & 1 + node = nodes[current_node] + current_node = node.right if bit else node.left + + if nodes[current_node].value[0] is not None: + output_fd.write(bytes([nodes[current_node].value[0]])) + bytes_written += 1 + if bytes_written >= total_bytes: + return + current_node = root_idx \ No newline at end of file diff --git a/main.py b/main.py index ea6a1d1..8514cd7 100644 --- a/main.py +++ b/main.py @@ -1 +1,22 @@ -# заглушка \ No newline at end of file +from huffman import ( + get_frequencies, + make_tree, + make_codes, + encode_flow, + decode_flow +) + +with open("useless.map", "rb") as f_in: + frequencies = get_frequencies(f_in) + total_bytes = f_in.seek(0, 2) # длина исходного файла + f_in.seek(0) + +nodes, root_idx = make_tree(frequencies) + +codes = make_codes(nodes, root_idx) + +with open("useless.map", "rb") as f_in, open("useless.huff", "wb") as f_out: + encode_flow(f_in, f_out, codes) + +with open("useless.huff", "rb") as f_in, open("useless_decoded.map", "wb") as f_out: + decode_flow(f_in, f_out, nodes, root_idx, total_bytes) \ No newline at end of file