From fe15442a6d411a80c967546bbf41498073a0e38e Mon Sep 17 00:00:00 2001 From: Daniel O'Connell Date: Sat, 3 May 2025 17:25:38 +0200 Subject: [PATCH] handle duplicates and docx --- docker/workers/Dockerfile | 6 +- requirements-workers.txt | 1 + src/memory/common/db/models.py | 51 ++++++++++++- src/memory/common/extract.py | 44 +++++++++--- tests/data/sample.docx | Bin 0 -> 18770 bytes tests/memory/common/db/test_models.py | 45 ++++++++++++ tests/memory/common/test_extract.py | 100 ++++++++++++++++++++------ 7 files changed, 214 insertions(+), 33 deletions(-) create mode 100644 tests/data/sample.docx create mode 100644 tests/memory/common/db/test_models.py diff --git a/docker/workers/Dockerfile b/docker/workers/Dockerfile index 644eda7..132317e 100644 --- a/docker/workers/Dockerfile +++ b/docker/workers/Dockerfile @@ -9,7 +9,11 @@ COPY src/ ./src/ # Install dependencies RUN apt-get update && apt-get install -y \ - libpq-dev gcc && \ + libpq-dev gcc pandoc \ + texlive-full texlive-fonts-recommended texlive-plain-generic \ + # For optional LibreOffice support (uncomment if needed) + # libreoffice-writer \ + && \ pip install -e ".[workers]" && \ apt-get purge -y gcc && apt-get autoremove -y && rm -rf /var/lib/apt/lists/* diff --git a/requirements-workers.txt b/requirements-workers.txt index ae0428c..bed6f1a 100644 --- a/requirements-workers.txt +++ b/requirements-workers.txt @@ -1,3 +1,4 @@ celery==5.3.6 openai==1.25.0 pillow==10.3.0 +pypandoc==1.15.0 \ No newline at end of file diff --git a/src/memory/common/db/models.py b/src/memory/common/db/models.py index 8cc5e41..12cbe2d 100644 --- a/src/memory/common/db/models.py +++ b/src/memory/common/db/models.py @@ -22,11 +22,12 @@ from sqlalchemy import ( Numeric, String, Text, + event, func, ) from sqlalchemy.dialects.postgresql import BYTEA, JSONB, TSVECTOR from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, Session from memory.common import settings from memory.common.parsers.email import parse_email_message @@ -34,6 +35,50 @@ from memory.common.parsers.email import parse_email_message Base = declarative_base() +@event.listens_for(Session, "before_flush") +def handle_duplicate_sha256(session, flush_context, instances): + """ + Event listener that efficiently checks for duplicate sha256 values before flush + and removes items with duplicate sha256 from the session. + + Uses a single query to identify all duplicates rather than querying for each item. + """ + # Find all SourceItem objects being added + new_items = [obj for obj in session.new if isinstance(obj, SourceItem)] + if not new_items: + return + + items = {} + for item in new_items: + try: + if (sha256 := item.sha256) is None: + continue + + if sha256 in items: + session.expunge(item) + continue + + items[sha256] = item + except (AttributeError, TypeError): + continue + + if not new_items: + return + + # Query database for existing items with these sha256 values in a single query + existing_sha256s = set( + row[0] + for row in session.query(SourceItem.sha256).filter( + SourceItem.sha256.in_(items.keys()) + ) + ) + + # Remove objects with duplicate sha256 values from the session + for sha256 in existing_sha256s: + if sha256 in items: + session.expunge(items[sha256]) + + def clean_filename(filename: str) -> str: return re.sub(r"[^a-zA-Z0-9_]", "_", filename).strip("_") @@ -65,7 +110,7 @@ class Chunk(Base): @property def data(self) -> list[bytes | str | Image.Image]: - if not self.file_path: + if self.file_path is None: return [self.content] path = pathlib.Path(self.file_path) @@ -178,7 +223,7 @@ class MailMessage(SourceItem): "sender": self.sender, "recipients": self.recipients, "folder": self.folder, - "tags": self.tags, + "tags": self.tags + [self.sender] + self.recipients, "date": self.sent_at and self.sent_at.isoformat() or None, } diff --git a/src/memory/common/extract.py b/src/memory/common/extract.py index dadd343..514244d 100644 --- a/src/memory/common/extract.py +++ b/src/memory/common/extract.py @@ -2,15 +2,17 @@ from contextlib import contextmanager import io import pathlib import tempfile +import pypandoc import pymupdf # PyMuPDF from PIL import Image -from typing import Any, TypedDict, Generator - +from typing import Any, TypedDict, Generator, Sequence MulitmodalChunk = Image.Image | str + + class Page(TypedDict): - contents: list[MulitmodalChunk] + contents: Sequence[MulitmodalChunk] metadata: dict[str, Any] @@ -27,7 +29,7 @@ def as_file(content: bytes | str | pathlib.Path) -> Generator[pathlib.Path, None def page_to_image(page: pymupdf.Page) -> Image.Image: - pix = page.get_pixmap() + pix = page.get_pixmap() # type: ignore return Image.frombytes("RGB", [pix.width, pix.height], pix.samples) @@ -36,16 +38,37 @@ def doc_to_images(content: bytes | str | pathlib.Path) -> list[Page]: with pymupdf.open(file_path) as pdf: return [ { - "contents": page_to_image(page), + "contents": [page_to_image(page)], "metadata": { "page": page.number, "width": page.rect.width, "height": page.rect.height, - } - } for page in pdf.pages() + }, + } + for page in pdf.pages() ] +def docx_to_pdf( + docx_path: pathlib.Path, + output_path: pathlib.Path | None = None, +) -> pathlib.Path: + """Convert DOCX to PDF using pypandoc""" + if output_path is None: + output_path = docx_path.with_suffix(".pdf") + + pypandoc.convert_file(str(docx_path), "pdf", outputfile=str(output_path)) + + return output_path + + +def extract_docx(docx_path: pathlib.Path) -> list[Page]: + """Extract content from DOCX by converting to PDF first, then processing""" + with as_file(docx_path) as file_path: + pdf_path = docx_to_pdf(file_path) + return doc_to_images(pdf_path) + + def extract_image(content: bytes | str | pathlib.Path) -> list[Page]: if isinstance(content, pathlib.Path): image = Image.open(content) @@ -68,10 +91,15 @@ def extract_text(content: bytes | str | pathlib.Path) -> list[Page]: def extract_content(mime_type: str, content: bytes | str | pathlib.Path) -> list[Page]: if mime_type == "application/pdf": return doc_to_images(content) + if isinstance(content, pathlib.Path) and mime_type in [ + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/msword", + ]: + return extract_docx(content) if mime_type.startswith("text/"): return extract_text(content) if mime_type.startswith("image/"): return extract_image(content) - + # Return empty list for unknown mime types return [] diff --git a/tests/data/sample.docx b/tests/data/sample.docx new file mode 100644 index 0000000000000000000000000000000000000000..8487fc64c1f033991a36edab48d7d7d29744bf37 GIT binary patch literal 18770 zcmeIaWmH|uwk?diLy+JeT!Op1YjAgW4Q@da+}$C#JHdjxySo!C=(ot;`|Kol-*fJd zueI0Od$(w;1*^?5tLCaX$LOPvUL_|93Wf#*1qB5}>6)Vs^izWc{HrG}S^p8hA))Q)0rG7QFRE5qNQBSiG=|UPbui?qnzbe(munpvC z7WfMJ8hz9J`aQN#^6i?@*xf0?iHFHv)NqGlq$Di|47j9odagk#>awPyA%fc6^Mz2s z5YGN}a<19N;1Yw(YNl_5hbZ1_A^m_uqa&$bcgZZ4Bh> zZEPLr3~cO;XkD!=)1o`9JL%zqPCY^?Z&=JeGbq&xFObOyCWDzyTC15$)R9F*^Q|8z zjv(u7I?1rGiBh1p_Bg)38&+Fl|i^Rz$;;G!jlx5%B$}J66&% zm^AS2^SW|9OwG0~2xK&wK{20(g>q1(o(9sC&ihg`vI`!>poHi=g25~9-LO&em#EH}WIUo$1@-WTg48^cP+kG(cKy_Zu0KPD4nh|rWxGQy!|4dS)m+!w>B;# zp4}tOApXk~Eq3r)83F?VZT#CQLi%}%^lWW^%#i#bV1|%dZm6;gRX5lOks=ucl#~(Q zXGROy^s%Lg#?8ea6W!i(WQy$9@*EExjm&9Ran(n-fLNJRpz|OItb%qJX@}XbdmDbr znYsx#A~K0Ut2m?6eIRWm#-@2?s^Nd0>Q4{bUZpaO%-G@=pY)}_lOk}6rW)Y@r9Zon zolER#*{|XaOc%6YHF%UDk3wsfw?k^ik+-x%fAf8kmb3iGr)W6!X?obp+7wHu1L$b+ z!Q>Ka-=L2tz-koJQ>khdJk6jZtqH0W@Te4Sxwm+3Os@9 zp@KJK^UkS_k${0R^4#RRZxd~CdbucU@$!2eckF}pv@TBSkV0b!9CsJa2BYAqI>spY z>-~=MM&Ki&NeK5;Vw`%IHfN;exYF04`9X~{Pb@U7UTuy*$l5&=*km?}4ND9w8JVU! zc5pjp>fk*7cy937gd0==lOpqPCk6HANilG8aI~>{nU~sVPT4Mcv_KcHz-{OtrjLSm z3ellt#{oj}l8`J^b_-&axbv)8;ODD$I()&jTqJbYtxqH8=kTJ~@>%f~ij-J-<%0AZ zjEZg)s>ojIQfWLQ5Q#%-c_g3C;<33x%R7@)m}lke{i*qrp~`^|vJR6ihr!?CYQ)zm zyrIFIE7fi+elrkWKxqM%u%;l;WIcl^*IC4Ccl!mq?o83JtSXJOTGP^EFszH%GYkgU z2yeZfNo<0Nz*msmzxzCd)fwU`b2pNQ&`~O8f6e;hqkOAHKBhR?(IDzD*4fg9IwrbN zPZkW7aMy+I)FcOvNE!Ee3_8rBn^i}7WCfLdkT7Y z6_W2Lv*%MMH7Vk~Zx^NNWbP26kOJdLFg3Sac^{t;`4nn4rAd3KtjSHtP}PW_a8j{H zI!=S}d6o0#-W=8x(UFT*_XrhoylpHC8|)9hvC&SI4fs|yVq|9S#NMcAeX%+1!n>rs zMFtyJ-1-HswwXR5M)wdlp;Vc4IZVQ2hOPR7j!{u7aazKp_%X4@W^S;5;eJP-$3`bP zNo3l!jF6 zy|nF;teg?XtguyyW_$>V7?G(5$Xl5*Bss~L)o~NlXAhIj-iUUpxQ5obz6<*c?e`bF z22+MIqn5ArR}&XF;i70KU;8`erMiB$?y<>_ww+rHyOOQ-*b-0ed2CL3bmea!TKurv z3U^HqRxA0j*e4-a=}T8ndM40pA6#2Ruw-C`pJFQq5%ksomEo-L>%2pp_BiwHsI^Zr zrasd>9A05vvsJYEZ8kc$#E0=3uiNqV?j(zoZYVEQp984l`0Nql`lQWDY!_hFD0p9C zUDOSt>Wl2^vz0m6N+;i;`!@f(DYG-dLVTw)4 z0Vw4+seUp{9U@pTQtb*^W3H7GAaC`Evssu$ViIfAD{lLv_~bg{dIT0QWhMENSAecr`efCL2!KeUbG}?szj(Z-d50=lcHzU@^rcdb?9En6U$&>kJU#jd)ulWTb zBRENWJ#UrF63Q{*tP*@RnK@54s{#Trz zT_gfTnxTV;WI2=8`%>gsIwTRXB&R}WH zKo219o#6vRN+^Lpglk(a63bMeTi3|;1=5azV-#H8F=S$&LcI}kAVKXY&E}oT8gZvVc$DbglaOC_Q)m{! z-#WhW-N~<6fT;va+vgFdmPD)UW4P(uK14m4-~C<(yK7a<`~A>i8#X6{V0T^dN!s?! zsr_-PAKfC0tA4&ObE;)cagT_)v|Y}j)NWP~RSRh|bhkt4=UA%6C|k*`H57-Mpo|ZQ zXN1Q_7$yCZ_3XJfA(*Sg+w+b)@*Fw-1&akC9kS5K>*jXh1S}LM8~g#5$sc=owsQ^0 zDL#*?L&eXZq`2ET-i0^ciBH4uFIli{L+0*iPKrlP$Y3;R1!dg|vxmoY9}(W1JK(uH zzB|pS+12Dq&m%5{pZ-*Hky-Ov-0z*uiM%oQ;Ra=RjjfTyK%#lc`^iS0i3R-YzN^vZ zefL17be#Gp3oqWm<#z4DiFfx&jye15o{#Mb>CVLdgiX7E@N9V;hR=zAuQ98&nf3 zZ{XKEs)va8&z{$_D?aT`LG(xlI36Zk44>iUwJU^?t2f0gtQ16&lNFeGeB38MXlf30 zd^MIA#qVKUP2kwp>VfH4IO}wJxs`xKkn1U6X6Jezo#T*q*p*0ZK%4UQ(Xx5{@&slV zJ7RK*d0YMaaozyLla8cB{bIt*UNKE4RM$l90dv};_kK;R;iQfQse_TXVxqz{*7x9c zi#4D|b#b8+p*h$z9hoC1uT*NWPn^v&XW1@Qa{J0APA9wH%nI-JVCLDtXv~R1yCWSKyc=d+S)b*e5ul}< z?i_2DTYDKxkLpgpdTdB4IcwczpjciT7{p{Rf8SDbnqZac)O6V5vZ{N}<>x{`^^D3? zg}pmgb*)(z%gLz(|7BJ z*oE|9>4y^ozkCL{!1VIw-K7xm*FjMFsf^v3y?-6e5DxvfX$O)&;IQzhvO7$D98aSl z%uVW{A0Pd6`bw&9*M1~_ytja^UXK4tPxeQ@e;EHbQl#m?AjIx04ls^t`k%-4!=Mal zGn^2D+xUk$pgTROmtp^8j;SB+INgi1xkmN28`DJZzl=|d@aqj6!ydz5XNEKI`|Ild z8j3;gXCA=i>m(6=WrO`|sDFMJ~)#vJwf4FX{R!XM%4{aBZVnAy2a`!BYwp z7JQCe`tZb7Ov}$bG|%a{Q}pdME<`&BR!6kDL(-gsK_!P2@WydR8!m70t|>}w5+7tD zGfgDNS}^y8n0m;%Sf|J&WiPJ^1#rTSDbU--@Tjl3*w3a?#(nQn22wv@UMbQ#&(1fr z#6@W)NwWBGq{%cJn2_WV95t_1#O?A+W2RA$t1rEhz4A+oFnHfy5$X|S&|0ab%#kmO%Jfaj3^`#?K>B3s`>V-CRl2e9%>>a;-rVjNTYKkpWP8#$;f{ML z_-zln^Co+z7_04WcULHKx&BYL-@mPT-z5h+BshPK2Wi&cx3#G01l!Mc+lXf!@#Yz0JqoIJb?# zE0IBe6xBdnda9m_SFm-M-NH_fD%b8{6zB`H z>w6w^J7csOVT7u!Nb^LjB-HrxOL(sMIaUqd_xUlaUnkyDwy`2x@8}Xi12YnZ=ZyFW zMIpPYN->fxb;@B3k5>(odBtmrHQA7~N3hTK*WY?Ac6piB`9q{5+_EhMoA#OpOIFKp zrFb0RBEUlq=4|nhUVz1I;o-XVbmTdI6*y#^r_EY`fLE!57s;|IX|&=q@_JmZMr_fu z6!2i)QE!u`CLLi7X2Afx)4L9$;X-hR+8T4SV6%bWEnN>+@0vKUOAX4Lecr-jY&IF9 z#e16Y=#6BoBPS1E`nbbQ0ulzHH7BmJk(Vgy{tV6SDs@Fi!w1BG!lupVHz-?ldCi66 z=ed~t7FW1vYM&Fv4UDxKD?-gt0?GS*=31+3b^d!YvGaG<-}tL^Xm+9WmrXzvlqDn* zKzfQppFhNX*4F-P?+n6Q@iY4&u`BM((F*AD1MxZv(MSpEW<(QT5j z`PaR@(IMP6wjp|6>kZ}g&UfbA(QQBr7A#(<7}kEVE66b z9At?um-$q(=CC9NWJ0{LKvML^Ry2+VEhQ(hM~zR0gCMS3$|ELubBVSOE#SO0rS~i! zpw$m=Z+9U3{UDFTAhU{XC~=V-GT=(d>@(Gy4#w4Yz_tc~J@Y6`~ z{PQhnNm-&p@%{oua#^i&d!@>;g8guE@EiD+M z#Hkxp^r^l9;Ew9ynOp38Q({QbcG`QTkB1hajC_wk8?aEcYl8?|;N^KWaxi599<0qd zlJq&S2L@9I_Tf7u+Ex1VEX1_oJtBV&4P&@ZW`)7dq@mh%q(NGc>aQBf6+c0oQy@FyU?jAD~RU?^kL-Bv)bw#o`N+4ulw=9xIyX9^<*Z8!6b;<*gFYstaeOiG;wT zuuuopE_*jy-bO@LzB5de0<&h(oaey(T` zWYXcQGj{9<=WxdZKXdO#UJ>+twVXxMj(8jqA7?%$VINvgIwKorkF02$1N${;*}d0A z;%+dan{V$23uB3hd+aLTnKE948x){>wPrWPL!#dE{t)0J+0YPM5T zrzbl1M8td2YAdv6<{Wz9Pr6ZW`A?}JgQ%!-+#x6%@SP8UjCq*(9=mC}uOyc63S{Tt zXLRf(Yrn*|#div@-}TQ1g1L-yKfYs-CL|`LY2|gI^e48Tq}Ax#jY2K1MTdoPd}T=QwZNHs=N3q zBfXBizLZI#?1)x0ry=h!z0+=q-UgTsOWdvU-5B`MV{+vguh^#hrM8n?Um6mGgy^VBO)In zf{(h6l^mf=bKxZHy1Zcv$kBM6q}k=&(ar5@uj#+eSA zbk-yDX3M$EkWVIS%jH(4CZ*@Aa*aoQ-ZPHgFmtX{>*NEo#W;Kr(y=L362g)xKcQ-2 zdyrG8!&)ktw4yFzecH~D*~>qZ+{G(xX70$16=M}*A3Zl!-I2j#%1un8G6@bo$T^SuEu_Y>YsAaBMUhl zXw+D4GPhY!4aE?-7?{8~`wn@vVAqobqV1Ax`PL*)qSt`8+Bk`3;%Q#Zg~Ly8UBivz zwae&I1yj|gyJ~Mnbn>THGnYNZ1|97(A@5*xy>txtA<5>+8}x=ar3GcQ=y;aESCQkR zHo2q1iVWVJG3B89X$%a=bz-^MCqyjqDt&CsqTN<83xOw!ZZ^DiqR07kMlXO*g6gEZ z)~sfDl0Zd#QcW7%vav9_9t#JAr?S4a*nQ$wsWLOX5iF%fIpa)6h0s|Cu{`+drZ@%;G4`i+?88Zd&ox7@qu@=V1ZlkY_ zDRiqVlr{K{tsC;0s<_apBn6^(JIV`4=hGn#-vUQVHq4cZdLLLjd0X+muPXvmXE11t zc39$-WYCW;P95CFdA9SOPSG>OjbcZZsA6AX49n1ZxNuC1Yvj+Qw#wgWU}3DTF??0o z8mgF6LaF(wm{L)=gTg|)VVvGUlcS)T#4L6s))-|5#)xu5*EC*SsZpt>EYY!_K$`r6 zo*x_ibiC#wy4BU(_s_#Zo&k7;8mAEK9L7r1I5f0MJoXMksI zTJvkRvh_4k^1ghpg4=tvcy@hJ!24tZ@Nx1&VU>M#%JfH^diKz%Kdn6<`kI}S4g4UtX zxB@vRRl|k1*DO}saYCAW8KVC}0`^*Fo*pHXr-Ccg@UyuKjSeGWCm6Co=_*JJbaoeV z6s5`$^XA~h@HrTy1BXxzw7bT$RA;nGyobHp@;>1Dhbze>gBEdZ!a5mU#xtaQC%>Ty z)bK7M5Dx+G-J|lE3`_^XNevJ9`zVxmUhykJFzmz%4wAgB&(f_jq!}_69|Ne5RWH!W86phTN zk?F#P@pe5dO|EZQv&T18zyI{Cb&n-1qTKZPMB!DJrmW&Eic8+_j>dwuQsj{n}-#iqk&AEG!**=<4ZM@FS9a2Y62+cPqgW&p*89 z$H`6Zx!$R_)UL^IAv;Qytgv3C^4t!JJmD7;bTVexf2RHlAy4Yb_`rc9{napF9J0p< z74z0QzVUf`py=6>Vy&X^U<3CtcO_%$zCCy45oWJo9raaXDa0y?XA#7lm<=-FnU?(* zzkbzc^^XWN4i4v7FW%E(F=+q@{6Be5l)sRtv5mE(lAgZhk79XgjPy$>R^)~|aQnTT zuDmibaF&XeGBphMmu{&(v8NAQ)(X}-sC>l>N&=*uy>FjZZYk2~cuSFKAjnQ(*wYLZ zGAF%6t#$ftjt3ue&~Q4AzqkemauEz}Z&WpfG(FRZ0=tpNp=*I^)@ zrrVyG;!o3xWjOrRxKv?~-uz5`3HckZe7TbTo>3S)M6o+%VuQ@IzI6*M z7+2Hy*T5~40SH;`Yw_m3H1BBw1MM$k%3+~TvOVr9^^*le1k~NV50oI} z^WH@YnAc>2qhcw&ox{uu$GT_x6iv`-o|mH1bQ7(Yw{a=+ns+0bYn(d7m5R!PMq6>@ z5o|}ZY!x`huH*sPmr~`D#%Qp!Qwhb*LtU)qJXGT*{4qK@h~3$3ky6J@ZG<>5fyvyt zWza$J*v*CmOWGK^+tj-b3nCG8?1tP&COBrxZRkcDG#cat2>DExUOGPp_?G`ml(;v& z=)IO zpkF((H;c!4K?3_eXaI#>56>*5$MWK4O6xOYW?IzgHcjjcve}8WHK9d#Q zqhDTVtg(Q7X*!>o9kO+^F%nwmC_ut`R|4V)&D<;aRH#|aQd7LyMdN;yVJ0ppw#X&9 zJ)fqj11THVhIG;wo94|H%++j#j2%MmWUd*^<$B%ZlDij(TJXTNy>HKicg<8YZ{$~k z0z1CYJD8HU2&+gCAFlf(%*SZ7ay-=Z*T-oCz7I%v_C>d@KsU?MO8j!zflIjutyG?gFz#)#c@|KiMWu)vM01UWi=XG#q6$PnlZgoW z_QC>rA<7zT5h_dsO4Y*`Ke<)$ zF38=uWaga?xCIy@(an<~cu8W;G8d-@ilXoD^rUr4IM5#s148nEc5ax_9d)DM z`(b}=nPl3Uy4vW{IwRD>VqZaROymdxmwjta*pOEON17T%?lRx^D72l3eV||+7c@se z)F>IG1KD*Bg-CqK&nOQG!ILVy^VG<=(WYU~MqRjIE6Q7RLw@x{SLD(T;{M@TNZ9%F z7gk`MVmy{H4Zbr^4Eze+hw0V$9KkTD3~z{+h$ZP@@!=(6f&W{)kN8)_a&UCBG;(-x zp3ADM4jU{;UZ*b>5L6W^QsHTl}KSg)$W!n0f&EitoZc-M9m%{GQ2NAdZp{iK6~Gf zvG@jE3e&KE&#Nbp1U9Hym}{Npg!xr}c?Ah}ejL^S`F@bK;d8#9#VbouA=22d>jCsl zkxW8^?nBd+;2{$R!u?SU@^+Ya;5#9B$cAVgOG&r-L1l@&lb5EIGf4L*`Q(g|)2nRj z*m`zsIPQZ&(^lbUarMBb+@l{^<&x2%$Pqpq4Syx0w;vL6YP-1WYjcg+MG0hBF}157 zo&6$-qi+3$Cb1d(v!qbOX5~P)Y3r!nBzu%0sdyU;X6YeMM3*Qrq_LD4@`sIj(P43j zr(x@8BR#LGNN&4oNJDX9O^*Z80BW0uc@0wvP7#ai4jLUlRNK(&sur`T?P0eAbKcPCwmUUU1Kk#K z+0BQI%XBp_1o6|v=HnwCxjgJ4bp!08UIm_xcL|zLG*i(15i;1Ot&g!V>Wrg0pAlVy zxvUwbz^9{J$R=s~nopFndQOF=neCRdKOCN}AEp;nSX3y#TMhq=@ABYKjKx%^_-?ff zsOIK%MVH3um?hjDD=%wG==z7U4MVG>Nw{`QJUwjBS6J(Dl#?6mu>t4>>uuI$`&RUO ztl+AzpZzJ7>Oe}j_r@q%OlqGy7SmHk3s2r_+&tTF=0|ka`4=b>O0mw*82iH63alAx zYF3zQJn-b=0%^!k-(2rR=A-x~j*@>l80Rx?<=^-6UTJ8Bl~VM405(erQN)kz$eK1S zzUc}cynnJzT`*LHmm|i)Wj;>2buWXbgA5x(FCoCjC;Kvop%5G9g@z+SYMFB@geSLHd^`j2{_=@ifu2>_bm0+eq5V|d2$Pw?!2$=Ls! z$=LUXE!;FD3$8t-9$&r0#k+8tTdd-C)#Izh)=49haZfLn()Ku!|3rD}5r_WIxaY!CGG~FWY}aXD9ra;Iw99U4cuPc4$NA9<(EN zO<>aicqVYhdX`!MfM-;Hz_W*)2j8YY;hEVl@GSNV?U+?80G`G6X7T+E&(O`FT7Q9O z>?W!AT@8E|E*Bk+M?3Gwu)|un*ogL)CUIyjCX-YBSI#xX0_mIuDC!+ZLET}5+q+z{ zm#JCG@|T-wo_60@lo1*or0tZug;~=Fn2KulJFbbq!ZZpzGM6Z(f*vxtC4+c395dCY zXY$Qt)wF8rR$l7RT%921GW40@RAi_7JAGZp?j#t;Qxk`8%Cm5!lceTa$fMU$`+#Ux zHOuL%{9HP@CP8aWpputTsrMwBvAmMPn)&wDBL)Z6mAgR@T-x{G)fC~~sU=Q2Qk{iS zaaTnTPtIlz)2AC4pj>b!{Y()r8Ufyk2wiNwaj2}rVImFHZ={aIO^aY;XQZ)s= zz%z({3(tO|W0`*Qo@0Pg7NR60Dtk<~(+1Y3O}iycP9&(%5Pl$FKJJUdY9O(LC?@5Y znuHwzTD*Hbm&$vZ@0auWD5gE(+Y%$^nHl=sB+BdwgX~Y6>&2{HeH=s%M6rerEk@|P z&9~e898?bed}<3)!**7N#%BGL`W~+eMAVcU@RXJFL;VzIT5NOlb{ZUIOvESNT*dtZ zA4~r$A6xrPHO};=RF>VTRH`f@dq_QmpLC<~)q0gbRxUZ)ksSy8`+@#O!oyZ#MEJ(= zFixR+EJG6!{6F|un@GI%R+-nHloh)|-g$|x@)OM|#>kPyCR<5I~VN!_xz7qU(z89gHO7sbvL#K7c zQ={#W*v6bBDQ>JmqNf_AEe4?O&ddtOh%&)=JDJNE!a*DzKadGd)i{)u+#wQfZ@t`w zQy}j?F3y_fh)*&|MKhOHrALDH-p~!LUuF74z0FV{0GcR3y&aegRVbL4NV#Or;%h=) z$|e@Vj$?=^>Sf(iQUGZw_085r(flMSTf;p#b8&;pH4^yyR5rAHN^#-A)X!|5S2 zPi278A`13r8CJpJ`=A>@ggX4ksneuI*C4?~^Sj2~7Tmi+{p4>vmR=idbxsEC>hInx zAiak@!V_)Gc7d>i`i4d1f_#r-Z#&w0{S5>L1yy9cX3oLwMLrc6+ko2i!Ui$ZnGe_BwFQO`(TM z{>W}`0okpE27V9?G*%i;o?F$A>=qvMZKwrSbCenQEKHaRKqc@|OL2HKOB7?N_Ucts zGXGk^)=PF9Wk;+_UwK$6KI<#^w#^Trf#8xMmZSiX-G1XjD_nB_JG))gCROsj)1Y+k z>Tu;qk09Hu22``sLma(p^qzLV6pqEhhAW{r(I+rDZg-Yjzpt%7rXq&@EOb1Fdiwy# zZoh8~0hRMkm!&yKn8b)Ds(o9En?HMK;pOz#!%}y1X4%iU*E3Qcy(!aXBdx|a*Pn|~ z9`*HZ#jJ$;(%ClShbMLM$+k=Aav>=7yS8ToY#T97L9a)}ry>(THOmCSPorpok&^BG zWkSRQuM&TN5zn@YS~z^R#4Vtj#gv|k40Y%6pZHkJ+<)d{G5+FXpc@3u;Qz?S z0(|QqN&p8VM@KVjlOOr5ksQBF7av^9w@}kl$4MCA&}0=&N(5VTH=ky^g}AE2=>;>W z@?NVDz71m!V=lb%`iv|p+QfOql3{!KWRRSkuf2*^JP6JACW;k&mYF#rwOCQZ&JOi> zlp@GRc3Id&s)AM{L`7Bc%UGaiekdY6>I1++i=I`W+hj)eeI*lP^U7*G3qU-{5oHfV zFgZDur=|UO=#p zVxLm->ALA5D!N=oynZU&J+*H8L9sfCT(d*HP^^I8|1pMV{NE__|6UYg_&*SZp#Bww zLjE@j0XWwGjY9vwM4>1Kk<(WR(l45dou>q{pRVlUu%9&*x{*P$N?UcWJ!8)D zUh5LK3I}{~=?8`-(|LH&R16LGkiSGBuS(v6{~U#I{)$3>tTX<3@!5|x$F)!w77TFX6I1kGPYq%#y*{zte7UJpw0APQfHpr=9g1ELM+xINFSB}T$1ue z3bt=wNz@v_=;phlcpIp^f8&K>+UQ`LE7!LyoM_2%z-~TuyvHovJ`hP7X`4=%DTsuaZqN3IRnHU{iG%2?IV`3EYPl=Hj zOathDpBVZ7oETmHnHYt9(+zl0s+hefReHu#D|UYQ_P#wXvGvX5 zW2vn1*FCWDpg#)Rx;8$D&U5>I8!c1e<)Q_ekfHC|4FGL`a`KAI^y{j zp!3r~`A4P79YCo<_q$T%wd_bAX3kCMAQ}DHHnqd^WSrvjy22Md`RN|cJMmUP>Caod z_5KyTSQjg$y#{=n_^lFrMbrwBz*_T`4|ojSQ#}r@`p(V<>Pt-GyA^*2@*jz%m(}SS zf&i5YU=_5ElAEoOgVu`+?TMFG1V~kaP9H*L$`Rl*vo~vO&o&Ej&W^erC3A**Nr2Z^ zo3r4$bOVWW>C((xR&VB)nWQ(@HPJfY0F)A$9_v8>uvoqVExGtDhnM+kP!~TH9v#yh zkVBqy=MwvS(GVkcD0AXGeti-}5xL-cV*w_k3`daM+5Jms#R*?Wt6lC+{s$neG;~_m z)XmN4unYPIx^G;R&_y%gCR2Kew{ME3u-+UYtV`uI7;R@6hitxw>LG4R94GWU?Q7`gP8?i3YOgDh98hK8E*nPZh)0$`y5G>;Z>g zgLId(fjrzj_cf{?y}C}HO%PsK%D-DTU|WQLng6*P#qZWXuh)OM75{E(fQr(;tpDCN;rA2#yx8p5 z(H{Zs{r|Ga|9A7B7ZSXzNc+3l0xIJFxIXRoQ~a!_{q+l&1f22r^#{LO|Ewqb)fy4| z7waDi+~2K#Rz|&O&i-!F1b zjP`%+oP_n4&VLXjzgz!|YhN;gzncNqpVt3Pe(?L5e)jOcj-#IYmvQ_+#lKts>`Z=r e^lk+HF($}Kf&-v95D*IB#~$!>4hX&c_x}M;pX2BN literal 0 HcmV?d00001 diff --git a/tests/memory/common/db/test_models.py b/tests/memory/common/db/test_models.py new file mode 100644 index 0000000..60477ed --- /dev/null +++ b/tests/memory/common/db/test_models.py @@ -0,0 +1,45 @@ +from memory.common.db.models import SourceItem +from sqlalchemy.orm import Session + + +def test_unique_source_items_same_commit(db_session: Session): + source_item1 = SourceItem(sha256=b"1234567890", content="test1", modality="email") + source_item2 = SourceItem(sha256=b"1234567890", content="test2", modality="email") + source_item3 = SourceItem(sha256=b"1234567891", content="test3", modality="email") + db_session.add(source_item1) + db_session.add(source_item2) + db_session.add(source_item3) + db_session.commit() + + assert db_session.query(SourceItem.sha256, SourceItem.content).all() == [ + (b"1234567890", "test1"), + (b"1234567891", "test3"), + ] + + +def test_unique_source_items_previous_commit(db_session: Session): + db_session.add_all( + [ + SourceItem(sha256=b"1234567890", content="test1", modality="email"), + SourceItem(sha256=b"1234567891", content="test2", modality="email"), + SourceItem(sha256=b"1234567892", content="test3", modality="email"), + ] + ) + db_session.commit() + + db_session.add_all( + [ + SourceItem(sha256=b"1234567890", content="test4", modality="email"), + SourceItem(sha256=b"1234567893", content="test5", modality="email"), + SourceItem(sha256=b"1234567894", content="test6", modality="email"), + ] + ) + db_session.commit() + + assert db_session.query(SourceItem.sha256, SourceItem.content).all() == [ + (b"1234567890", "test1"), + (b"1234567891", "test2"), + (b"1234567892", "test3"), + (b"1234567893", "test5"), + (b"1234567894", "test6"), + ] diff --git a/tests/memory/common/test_extract.py b/tests/memory/common/test_extract.py index 69a0c00..41a5da5 100644 --- a/tests/memory/common/test_extract.py +++ b/tests/memory/common/test_extract.py @@ -3,16 +3,32 @@ import pytest import pymupdf from PIL import Image import io -from memory.common.extract import as_file, extract_text, extract_content, Page, doc_to_images, extract_image +import shutil +from memory.common.extract import ( + as_file, + extract_text, + extract_content, + Page, + doc_to_images, + extract_image, + docx_to_pdf, + extract_docx, +) REGULAMIN = pathlib.Path(__file__).parent.parent.parent / "data" / "regulamin.pdf" +SAMPLE_DOCX = pathlib.Path(__file__).parent.parent.parent / "data" / "sample.docx" + + +# Helper to check if pdflatex is available +def is_pdflatex_available(): + return shutil.which("pdflatex") is not None def test_as_file_with_path(tmp_path): test_path = tmp_path / "test.txt" test_path.write_text("test content") - + with as_file(test_path) as path: assert path == test_path assert path.read_text() == "test content" @@ -35,7 +51,7 @@ def test_as_file_with_str(): [ ("simple text", [{"contents": ["simple text"], "metadata": {}}]), (b"bytes text", [{"contents": ["bytes text"], "metadata": {}}]), - ] + ], ) def test_extract_text(input_content, expected): assert extract_text(input_content) == expected @@ -44,19 +60,21 @@ def test_extract_text(input_content, expected): def test_extract_text_with_path(tmp_path): test_file = tmp_path / "test.txt" test_file.write_text("file text content") - - assert extract_text(test_file) == [{"contents": ["file text content"], "metadata": {}}] + + assert extract_text(test_file) == [ + {"contents": ["file text content"], "metadata": {}} + ] def test_doc_to_images(): result = doc_to_images(REGULAMIN) - + assert len(result) == 2 with pymupdf.open(REGULAMIN) as pdf: for page, pdf_page in zip(result, pdf.pages()): pix = pdf_page.get_pixmap() img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) - assert page["contents"] == img + assert page["contents"] == [img] assert page["metadata"] == { "page": pdf_page.number, "width": pdf_page.rect.width, @@ -65,22 +83,22 @@ def test_doc_to_images(): def test_extract_image_with_path(tmp_path): - img = Image.new('RGB', (100, 100), color='red') + img = Image.new("RGB", (100, 100), color="red") img_path = tmp_path / "test.png" img.save(img_path) - page, = extract_image(img_path) + (page,) = extract_image(img_path) assert page["contents"][0].tobytes() == img.convert("RGB").tobytes() assert page["metadata"] == {} def test_extract_image_with_bytes(): - img = Image.new('RGB', (100, 100), color='blue') + img = Image.new("RGB", (100, 100), color="blue") buffer = io.BytesIO() - img.save(buffer, format='PNG') + img.save(buffer, format="PNG") img_bytes = buffer.getvalue() - - page, = extract_image(img_bytes) + + (page,) = extract_image(img_bytes) assert page["contents"][0].tobytes() == img.convert("RGB").tobytes() assert page["metadata"] == {} @@ -97,17 +115,23 @@ def test_extract_image_with_str(): ("text/html", "content"), ("text/markdown", "# Heading"), ("text/csv", "a,b,c"), - ] + ], ) def test_extract_content_different_text_types(mime_type, content): - assert extract_content(mime_type, content) == [{"contents": [content], "metadata": {}}] + assert extract_content(mime_type, content) == [ + {"contents": [content], "metadata": {}} + ] def test_extract_content_pdf(): result = extract_content("application/pdf", REGULAMIN) - + assert len(result) == 2 - assert all(isinstance(page["contents"], Image.Image) for page in result) + assert all( + isinstance(page["contents"], list) + and all(isinstance(c, Image.Image) for c in page["contents"]) + for page in result + ) assert all("page" in page["metadata"] for page in result) assert all("width" in page["metadata"] for page in result) assert all("height" in page["metadata"] for page in result) @@ -115,12 +139,12 @@ def test_extract_content_pdf(): def test_extract_content_image(tmp_path): # Create a test image - img = Image.new('RGB', (100, 100), color='red') + img = Image.new("RGB", (100, 100), color="red") img_path = tmp_path / "test_img.png" img.save(img_path) - - result, = extract_content("image/png", img_path) - + + (result,) = extract_content("image/png", img_path) + assert isinstance(result["contents"][0], Image.Image) assert result["contents"][0].size == (100, 100) assert result["metadata"] == {} @@ -128,3 +152,37 @@ def test_extract_content_image(tmp_path): def test_extract_content_unsupported_type(): assert extract_content("unsupported/type", "content") == [] + + +@pytest.mark.skipif(not is_pdflatex_available(), reason="pdflatex not installed") +def test_docx_to_pdf(tmp_path): + output_path = tmp_path / "output.pdf" + result_path = docx_to_pdf(SAMPLE_DOCX, output_path) + + assert result_path == output_path + assert result_path.exists() + assert result_path.suffix == ".pdf" + + # Verify the PDF is valid by opening it + with pymupdf.open(result_path) as pdf: + assert pdf.page_count > 0 + + +@pytest.mark.skipif(not is_pdflatex_available(), reason="pdflatex not installed") +def test_docx_to_pdf_default_output(): + # Test with default output path + result_path = docx_to_pdf(SAMPLE_DOCX) + + assert result_path == SAMPLE_DOCX.with_suffix(".pdf") + assert result_path.exists() + + +@pytest.mark.skipif(not is_pdflatex_available(), reason="pdflatex not installed") +def test_extract_docx(): + pages = extract_docx(SAMPLE_DOCX) + + assert len(pages) > 0 + assert all(isinstance(page, dict) for page in pages) + assert all("contents" in page for page in pages) + assert all("metadata" in page for page in pages) + assert all(isinstance(page["contents"][0], Image.Image) for page in pages)