From c5f07c028d803bfcde1895b1ad81f9525f860caf Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Tue, 31 Oct 2023 10:01:12 +0000 Subject: [PATCH 1/6] Add support to unstructrued --- autogen/retrieve_utils.py | 38 ++++++++++++++++++++++++++++++++++- test/test_files/example.docx | Bin 0 -> 13989 bytes test/test_retrieve_utils.py | 38 ++++++++++++++++++++++++++++++++--- 3 files changed, 72 insertions(+), 4 deletions(-) create mode 100644 test/test_files/example.docx diff --git a/autogen/retrieve_utils.py b/autogen/retrieve_utils.py index bc4fdfb75976..217caa5949eb 100644 --- a/autogen/retrieve_utils.py +++ b/autogen/retrieve_utils.py @@ -15,6 +15,13 @@ import pypdf from autogen.token_count_utils import count_token +try: + from unstructured.partition.auto import partition + + HAS_UNSTRUCTURED = True +except ImportError: + HAS_UNSTRUCTURED = False + logger = logging.getLogger(__name__) TEXT_FORMATS = [ "txt", @@ -33,6 +40,32 @@ "yml", "pdf", ] +UNSTRUCTURED_FORMATS = [ + "eml", + "html", + "json", + "md", + "msg", + "rst", + "rtf", + "txt", + "xml", + "csv", + "doc", + "docx", + "epub", + "odt", + "pdf", + "ppt", + "pptx", + "tsv", + "xlsx", + "jpeg", + "png", +] +if HAS_UNSTRUCTURED: + TEXT_FORMATS += UNSTRUCTURED_FORMATS + TEXT_FORMATS = list(set(TEXT_FORMATS)) VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"}) @@ -123,7 +156,10 @@ def split_files_to_chunks( _, file_extension = os.path.splitext(file) file_extension = file_extension.lower() - if file_extension == ".pdf": + if HAS_UNSTRUCTURED and file_extension[1:] in UNSTRUCTURED_FORMATS: + text = partition(file) + text = "\n".join([t.text for t in text]) if len(text) > 0 else "" + elif file_extension == ".pdf": text = extract_text_from_pdf(file) else: # For non-PDF text-based files with open(file, "r", encoding="utf-8", errors="ignore") as f: diff --git a/test/test_files/example.docx b/test/test_files/example.docx new file mode 100644 index 0000000000000000000000000000000000000000..f377c63c12986bc872f2b22d5bc316c941fb341d GIT binary patch literal 13989 zcmeHuWpo_LlCH>NW|qaw%q&^V%*@P^Ww9(~X0Vu<$zos5* zz163)Ao7dq%8tm0%=}7T3KR?#2pk9!2ndJ}$ah-1*8>;`DDDFg5DE|^h?cOOt+R=( zv!1ety@`_!ox6?o$2>3)iX0#iK>h!0|BG9oK4H|dmjO}qKIkoAy0K5Pm7<_;I1hp1 z=LZ>neWFMdB@N`6-P?P-!B9$4`ZQys7_;*ZGh?asw|16cRAHKoaS?KxaIgx;*6Mue z&v03qO+=w!ssRq@%r)H=E+1O{?sRoKof`qKRJ*T?iPyhUV@%OjKFL`I3e0cG{NLG68R-3jsUm$X2|?&uEf< zE7!j3bJ*HJV~1>E%q_J>P(Rx6uoB8;eDoEtlXpzcXvryXwN$KSTDPh5o89xst)5o1 z<>Lrqyv|A{v!NnIbsg_YB%b-M&%-Nv*2a$?s%{VOn|Pxehy~9-DRwpbnf}v63n(Tw z53iU}2ZsLxnBnUw?s!f8GTLw^LC@n5IoFHP>1}Mq94-EmQI6rg-Xclu?evX-3y%5R zSF_s=^Ad&5`%6#m-&<#`gXW3MBtGk9|%x5hnn6 z><@rL!T>y0&(Xx%iJtDa@;{#Xzt}zh^62GpJ(8e|h`|^B7l@7*6Rv1TM~xUr=%^yc zpb^8TY|X=gB$m&2g)@27XEValj|bU4omfm-$ulp?*ANwawg{uzcg+G?YVd%e4= zVJTAD)ihbZY5924WZKtn7=(T#etv@7dzX$oIp@!XX! zmXMT9JV3YXf-?}9MDwk%u)Rt99e0irtZlXO53}aKk|x7P0BMptrv2>ux<(K;yc$9p z9HmIhT6RDqE^rZ!OS{FO;KDV1Wz(Z56bU8hkmHb5VFYXLIe`L)*TtfT%W{~>Bj)5C zIDn1D;g}BH@5)XFs!W zo_!Pm0UN&qH4jY+Dfs07?bN-iQ&{raB(bp8zwt{<|0piJSupe5S#ncGF0DcuG~dm4 z-5-SCjbi1qCffPCDk{nGXTWd3n0toDR#DEpQ{SIP&ur5|nunm+ArZCZ+?cXFQW+hO zpSdPJMY+%y)gWtHOVgvb(6ixSqxg_3Z;K0JHnW$ANY@A7o-=M5o~blHRN&DnxI=tL z=eXQ~r54_Zinuf!n(DeDmyw?X2IG$SPFCu}Xt}FZft@2rj%Ysr5w0zEV|lF@hte`` zVExl8nZ-=x9Ar^D)(;_M_yC+-N9&vtMgGBKFTyHdnwngzk>X`gX3Tj^>BoC`Y`;0b znEUu}C?kFhD1+B;g(0bG6nquCRS4Irw-%IouXrSoR!uZCZaeYpz4av~Dmbf&+u>9- z9A%GT4AuERxbm-K@UAOaFvl#1?ubFHom3}RhpYD#p4;XcDc0@58Wp;(btWXo`N@I> zVeB;f1oI-H53Cl_b`g69gSb=o`arKy4m2;`dfrjiCr_Y-t$ za6MV9mzW^l%EzDt$jOH3@M6fy=n$i@s|6$fhpBN)07=@OVqN$h(gV5Od%|65(co7jj-Z&g`Kp3t1;g_EaxRP3cH|Un=uMK74fbuSH;f*TlRMc&M<$xjTB= zOxC^LRbEK*EG3OH(5A`@T|u^sDi%v2&ah^Tmhg{2ge4W3O=v^^uCHg4V5!N?49@dA73t-! zh2PfR@`6|77cWoGg)ZWu6Z3F1v=1p-l43ix<#$u;)!E9ZbaY-MFr*&>sirhN9{9r$ za6WVgeHZzt8pr(byaAa!`h_L+MQ>w@1p>wHj*g>@#2~Vjj-rl0&5X`hV(BCJRYmz` zM#vJ9$%pcrN<>rXCC`iYCE5kn9!^kNKD>k8{>6{j!EOghaObwcWy9_+rpxvWt!)Uc zny#`fggH?S(`Za0IMv;WxBR;x!+jgNC!QAr{$w>CHC|5|Xib?vZy{iN;3y*U{FD&F zY&bHy+v-xJvul_`e8V+4D2gc3OmjRrptPN6Vf%I*Qewx4&Bg46evEKOX*&@Ia#|Dy zIgDGjsCQGY&odeyG}}IOevxm~md4uK4NAkTKaE~1*0Y=LNMh~f{}MO(4v4}39-$YE z>lA(hdJ;xZARt^INZ{Xll>Z9Tf9+TP6|#W=$u%IJ|L@+)zThahmjHTb=9)bBmbsD#{shL_CbB@>-;FFTt-m z7eusDvpT_}!i?3|($%%jnzEFYn!EKA$>UI*V*J{WV9bTfV#1drMIRX*u+K7}3T~t8 z67HCI;Kn42C7;F6U#W`dcFP)K3Hw0MXqUF}BY@owl#5S^kl+42V(f@RQXNmr_(ME{ zlt|2hsH#-h6hXKVQ;|GPX?K2qNw+~(S(i?2)-2syBf-E$b&6B7#!T*26m>@vN_6%i z1)&_D=ObXO_3u_==KE4V0S*K-{t*ZW4PXqvTaB~1iH!;U?UG3I+VscaHbz4EI3@ZZAx%%pp9{j2a4b_(Qtv52 zfo!kDR2J5gLkrF~HMzN`*bBr*_gP;&FOH5hoNiYFVti5hI0_X;DRmt1eDfGHe+=1i zx_u20>IK&BHV1)+ZDY_z*_`K-LLfd-9QW&)+qwMrM(f}8qG1%~rQ-Rg9y(jOw; zt{dM>D|S?1+rder6VF)T(_VGwyRGD})z`l_=~T_q;JCv{;#+AwNOtMmKY{B8fcWPx zv$d+)u~g@?dGNr@Y~bv*G0LmnjH`H_u~V4z1O$F zt`ZFJ3cKtM&$rV(H@(-#StXeE@gEd01lx!X_Ar_L+$;$&_0qm#r)MO=z_SOags3W2%{P#(#f>deZpsx3g@ z3^mD3>}#yyLCRS>xv{seWe+l)*T|T#3xJOc6Y0QvqY?toVP+B7%YPFm?q4Y z0Npbl3RtraBSuGu0lupz#A zT-eCFMH2zkJTTGH$*Y|%Wvh#Q*+_B!0nd0!YjmjZZJ-fL$WS^4d<@aB+D@uTvCrOz zDz7mfJzZd;4 zjh7=DK9)y{W@@x`zqaymbCK2K`LQ;wrQDRR46AJjqS$l>L`)B~g3hZxiQ#_VEDVYh z8OHyvh0eQq?K$4>bPkTTzSBGm^szZ(qWcl{*sQzw0XO^yg>Owp;Wx9(N5zIoLFpEn zZ09DGChU(lm%g$|_wJ<=Gj8dMKwC1i0Mn!Fv5I%SKrdU}nOkQB>Jgt4h( zBnpi&#Z_FBgt=<2gfwaB=x_ym$w6*_vY4H;=g~=G< zRjZU_HNtGlD0x{XA?Q^<-l0+w+!m)bqe%+>l01gYOy?N`0VRzt_qCVN` zCqmg2r>180@@bfLJ$lmdiTcEtB{Q#P=tv^nkw(D~&s(nAa;k;I8&z7iCuD3Hj{;7m zA~^w=JSDt=l|047pSxjOtX0!ibO%npRg^Fkw3$btc9_TS zamFjV^#G>CPkLsIQ(&&dVv>O1JZ7?5@}{$gN@6{|rLtwj#iOY2nE(>Y^2ivRwSP&7 z_KdYeM_(`|)fO+!!>ms<8$4a13_e6fr$kGOzUk;tUF?gTYMB#Jb0L*a1H+O|N0Ky? z-?@%SG6cvNmMtg3x@FtUZRxuIYVjRRqGLZTOX-ivmeFz9iYb~Vh#<+T zSqhf62H9M-lLfB;-KG_Dczn(uA_3HAj!lXu?9dI0O3lZmslg{_&>Z^5r#b=7v21JPSg z|Gj7GhAQr}0&=db0r?f1=o(!q>a(;&f@mOnOaflflFud~sith5776WjoV=6i1nh7x zR+ra%_n~=sBANeWz%B{ASW0pp(Pw6a&W(!ib^71nw$bwZcRZ>Gx!irOp6xDNQwIio z)A4~nEMid~2pUC<<$X$qGyRke8OrI~APeuH#O>d_WMtruLNveg%OeiwYP5`6#Dw0FIS2RNt4-8l|Kd@o|T4m7*_4%{@H4 z#6D8f@?1U^hp@*YT;Hf@C#%*vMyQ~ND!CY}QK}~rdG1+dJDwqV$_rEtNI34VuRz-ZGGy3==oq*0Tr*S1l1*XPtOfI&VMq zlC&)TmA@K+mJK9B>&7h%+DkFen9cs=$GEnRKI#Z*n87=-;kEH{idd_>1bOy$kcxH< zCdEZ{K@gA?t$>UWWNxqm9>wnR^6TH5FQY zV*|Cj$bFlI_TJm^8@;0}e^bs&#A{Xhc({)y(M~kN;ofUlsHh)0SvM=JT4G-T}OQK6wjvn2lG%>d20>A-ipx_#U>~#v}c7k5%jHf{i3i zEl^rfK|ohf@m$cb=&xAQJO35f$MhIpkp0tknJupGwusr`K>)rD^9xWfxLRL#m6`-q zF#hQ&RuNnKePQJ)|5ZbZOOhPcsrI>^+fS0ElD9U^(-bNw3YN_}!xTPpovj4(Ls(?+ zM5GZT9}b#S(?klpQ<}`D1D8y?i5*dh$u(c*N{{XipPpM}3+>w!giM#*OtNf@_R_9-=)bkcIsOO!#OQ0s#Sb|CMYz zIeS=}{NB@?>a04hiXnS%mcBxuX1fxg>U>g24vUL3wmQ>XRiN>>;Q(^Ig-3jHyZ6jQuRf2=5j>g=&6t%S)=h;6X+al0nwfUb)}Oeqrb9kuTh zX{nbY_UgEZ23?Mu{k^iv7}LHUTo5S6)<_V=lAQRF&`ot8zhA%1^FPVG$=InCF+50E z5+ZNm@R^K$8=9XyWvk;qqko-@{1ndP&SmC-Ie1?iP@CvMgY(%fb^l7>e2a#j?j~j4 zt+Qcu?k$z8dYWl0d}Y6kBMjoXbXq*C!0}!!NgFn5tnek;vbT86vEr$rk=EcyxFxd4 zOYi*nwGH!yakcI38=H)=5(a3}*}9HpZ|}4RGJmyU>hMy|7Yi^Hq1r=Dc;I(c8jU>o zjV2?bw~ELm!<*#uJ^!GZbJTQnZLzT4E5Bnwt=$O`)|O;B{*cR9akro~2gWIz;YhuWP1Bko2(IT)zcL72cEk60E>_@Qyq&_8=h}OO&vbIqp~GDGqqj90%N}r~!uuM(n|@ zLdXoEKsx$z&gZGEv$N*aDx@#XtMpKSq6!F5fOZm`-#wgvZR5>&-m;C}09*=S#b1aR|qaI^fZQw*=nza20ZJBYE0phud2qrsl`)nLhUV1AQewFO*Q16*jj}h7r8FI3;|Q!x=mgZ=af-Jsdr;}99wr(WC>Fx zc0oSFlS(obTGg{DKfExY243sakR=N+RR=+M`&L*NCL1=?bZqh-RZ;ykYROC$>pCff z{F4c3i&zHp%YFNgROwNCFXHtO5+pgEJmGd~CWFrvbdSU`66C{One?VK(Jqzs;M1>) zdVJG}pN+#e$eq)xISK%O?nXwxgaCVhyW9JfusuCG)wnXKFsM6en9$fTgAM5=LD$tV zzXg{&W&DqysfX6|p(DcBr(9&sLzpB;NQA={rWpeyi}I28c5ggLG~wn`w#>HpL{uC| z+j!~pW8VS-Xk*D6FGO^X)2c0eTU$f^K^+l9?u`v^~V|Cv`&#&jYi(jMz6dqH-^RJoV z#P-a3*@3fV_efICk(C0zUAJJ9tOt?NK71x*O5#Nk?38%`C8mhMu3wdwo?j&;VnuL3 z5h%rW5#m^8vsWg0A0Ri@QBqHMB`OZ7B|-8pZftD*VLp0mw%C|@#4hORw#8;Zm>o*8 zow}2;*2tvS$y{Y(?W*UNue5w4{K7!(-=?8*R@EDU(5RUN^_;eR2cACg*;+5wg5`NA z-<^}58(Oar6QK>qDq5GBPOr^^QFokVfhL+;^E=z%Dy>@GF6(G~R8sZ$Y*dnFkyVn7 zr+aVhpqMy^#Z5NVpcBWYL^_7bK;(EMr=rR7u0OerZarDz4<^|8b&w%h_VMPkw$MSF zXoXmu55(l`l4d$BgezcosXmeOiFhb1E?)DqC?C&HB%F(e9#0fgT3=V-iq7d>I+9U} zSPleVS&zlLkAU?8-6n)xd5f36^mpoY74L-#EwuX3U{-Fx8 z+5E$q!!-2Dvj}y_HcEx(7|0r8W)Y|@WW78gm}W7rDYPwQg1lc#vOFQQW-%fNtumw^ zI#q`6Rv`45gHX^NpHL9RZ(s9Vp+6vh)G0k!+pst%RSxk-rwSB+K@-XeDm@72{eQs# zb)nEwyr9tXa)O`3AU%AVUG)r5G!BkA$sT{@@XNU}(3rbsOZZEm?LzmR~q?A#=u z+_8T?KIG+ZHQo%Gx|ariC@v!{E{Ryt?zDx9Ygb4wTBPkMcbY@@K1uq)#9;~3s+qhe zOnL1H7P0zWdi-I^U<19vYTrI&)zEnyG0I-cxDx9-ux>y!sfQdnMcGNTj_5^}?c~kY$*Pvsu zZbdkx_F`~`H)UZ8yHY}cgn_}ZBp4DP4XDHE3AY5=kz)ft#&H}GBf}~fe_3&Sg;}eV(A`ujrYz1$yHS~Z9Pkvp>_TeT z>yQ+DnGEiR>++)ZecWMjB{&}A%7G8{xfF8Es#+@gB;sV)Dq=9>#lJpO+qic?*!`2L>v%3j^z8^Df8z{IKErh zJW>Z$B;UsmVmOY?VN6+7QH$=wRHMriJn)q!l_(CBlKbiKj@d5YH8xqvgs<33DWNrY zsC|a~p%$)ngia=%HBWEJBCu2P#9-L#x`()TSDSi*H9EGDSe7)X&R99Icf>QgQ@rMF z&%lGD+fmcxezt8Aok7nxe0R%sD}3GS^lH^mxh&$XBpuZ~MPqH6%*owQind=SL(oa4 zB9+9oDt5xR0cTOFSE>SkuPaKUK~F(F83BDC@1dj_qBMfVk`Tq zNHR4aDX2~>rM6*RNiSvAIFBB2C^Xbw>42gq@dnd+7-2u7wsmn&XX+N2qM~wm&M1Pd~}m z<{CJ?f99e4A@RAp_#9LgW2$z^LY{+9>D`7aP}9X?5B|PJbNpWMW_}h&He*Ad8kuFo z{#wUfnDYlc7NYCedr7A> zmt^6WxUeEKo`isLp=SeEalCQ5YGrT2PxMvwM=((;DY!f<+e>Wt>#uamtW5=ztt+dV zKr9~&it=6V>J8&COZNyal=jT!Mx;u!#5Pm>;-0HK;yxSio7dPAq+OQ8g|B(J=X4FW z)c6=CCE2f-J>dtU==*dq@0+z{r}UE<-8e?nZInWv!Sz;b&K+~l;CXXs?>m~G&jF{u zYLyk(-p!oD!Uw3lEJK<{a@hwlo@`AZ?!!yI)2HWAvZBdi6o`u?tZ?Zpmd)Hx+GmfD z#u+FXJ2Uc-7%p84PhMuL02SBG7rBQRkIp@6TSta0)fNmd_AXSyxPMZWt{CqhVML;C zq4Ceng@LJbs7sVg|N z@qSb)Rn%}UbHC&5&W$1`mKJ&*?zS@ARvhVFz^m!5ZciOv8S{);hZ_7S@nmU@-gv<% z?cklKR<=&MjdPwh!fjwM7;oK&?c<0=TMM0j;N2j+D@+f*~jXI|BG?4VMlnT_e zxlcC!JhkyylVw(~7Hr;q7X}t3j%t;wA6BeWwW8qXe00?FWdqcN=Rb+-CC+iU)K#kx zapSd^Cabu~sE#^XMqMTX^wh z=Ic+mHy=wZ)CYyKle-Sw@=)XO!A`FArh%dPB2FF9X0AND1RA^B+`QesdP{NYWI@sv zQ<$_G(DHkCxU;RjJ$xRl$8an%6swqAJEgs08%VI30A6?HBTc zos3OB_9iU_cbq~$>eQT^ud#z9DwN3@{+;}Em4>^1giNGCaWsP5;D@f)kAPJ$T}q4Y zx<_OH(}04#5cipHDgz3HSB1$UfV$uVt6P*?7BcNyE&#t8qT^0E;Bpv@vccZLN+O!OOKd-Ybi_O`gPgPE%fsv|hJ712x!ENqbc{@qhs;!tkL7)1D_n`_~ zipYAuo&`EX+9}U(J{FD=Igs&seO2Vrut_a<8%QQ+H+bL6TbQLv%0j4{8ArVrnS3Zq zKBCt>5Sz4k_QR?+@lNyL56@|;NnEv}&3c!ajoAx5IiI)Iw@(K6<&9PD z6UKJ|D9dcZ-$2lD)2bkdOT+8$?^fF%MhZB%4v;l)=7+zPFdt2@E~d;yn}ttXGkV)7 zB6b+fbiZ+>{T%uLdf}(C!A@FEd;kYkLFmm>@54}k!iS&1y8BWy9ktpY`@9nK+#gHK zTo$ptwLVBt5A=6EtM2;c7Ip+gnEtLK?w zm`X`!OuMVCGOI!qShO*|b$kHFalqKfj6P`R^t~FH!IYhQ zfBU^lJgY|8QBxV&v=}&GfB8&iMEN@vTve-LP6f~oC#eN;p{co5g(L%;H4G`MAMrOU zTPhvJ6U9s;(dDRp-+YGlUQ+dul7eQmpYz2bvT+%aah6r#vcAl)9}+%R^pJdf7}K08 zXyKZ2p&L?M66;ZMHjnx_9fDMT!d1Kj@A8cD(xTbvqIMf(-C^ikn^4yWttF7O?Q}>~ z_x=WK`h1*ESRY-|+f>`!C6E)4t0<+z_@@R`vMM;;vl8mMA5azgxMCzphw zpOAy~&Me~}>-0S|f4qlXDLWt@uBUz(KgIZ`UXi%JK*9}Rqay$tMf|Ic8ra+a)+heQ zL;>~**fJDHWdT(6cH%1ph4VQJg)(THNO`2~?tI1BJL~22zV9`(t2uWU%Pffg&9So+ z@!t9&E6XZ;nW!GYs$2y&9V2dgz4J_yhRCW=J2aKoC<8A^db-_Y9MOap z5m*vS%_HdvFK|W*Y)fGrxs3&~dqUL6J@U+IDw^9J@fWl!eWs!{`tH)^Xrql1P_m?K0;<0O2SY?JDVe14Cda z@7{o@>-DrE2@Z+C?AGmpEJ5~W-it}anm4>?jQ+?gQ@r!MOLw{H9uW6svR3GX$dt{n z#T0g*NjfKY@)jKMFAlgIPi_P5QH2h$lANSAAOd+N)IZatrx~uvCF*EuzA0$CK)+(W zPR9-6Ih?cZh%w%J^5l!+KJHFZSKjrhW>;WCBjES>>v4OJzy|jU$iM52Ssoif@ zs~q!R+%KPgbNW%mOm((K&~>yKl`es31)3>h_1P)Kr;EfkOSfbgCl0s$QE}LD(7SCX zO-MWL83Cr{Cugl8ZZ0YbC3(^^A=&B}If~ud39>_m4+VtU-aUmvim6eKggD#YhUF=6 zc~mw(O(#jYt!1KUxDVhMrbDAdeY)@i)dWn?sIDa57hwvrJO%frGA-!8eOqyq z6}0le9#kU#_Q)5J2rrMoVL|jG$?EfS;)c!qJtRGZ4Wey=;-ZY%pv7cjKCxOFG&km| zgRKj{a@sHy;a1&XZOl_Np5YhVpJ~fnb?>W-f2NzjAhdwg@}FPC0W{a&wtsjnM_%gh z3jY3Hz+cg|fE@NOZwdSg{FMOxClmzYAL!7(!vD^8{1XfWqzv;1{D0s+{;KI$2Hc;z z!T>qqKl0&zRq-o_;7=9dfGqxZ6@O(E{0jedS@2KzJ?6jQ|Gh}~EBMzT!=GSG%0Ixr z3?P2h@N3fhr-l@&KQ#O;1^yNP_Y~$&G!PIp?eG5g_k`wG_}`<;U*VPXe}VrIYko!l h>h6EywV3`d&Mz Date: Tue, 31 Oct 2023 22:21:25 +0800 Subject: [PATCH 2/6] Fix tests --- test/test_retrieve_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_retrieve_utils.py b/test/test_retrieve_utils.py index 597808191032..b74ce75c4641 100644 --- a/test/test_retrieve_utils.py +++ b/test/test_retrieve_utils.py @@ -166,8 +166,8 @@ def custom_text_split_function(text): ) results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1) assert ( - results.get("documents")[0][0] - == "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities of Large Language Models (LLMs) for various applications. The primary purpose " + "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities" + in results.get("documents")[0][0] ) @pytest.mark.skipif( From dc6f7cc4fdfb28fbe00574e0a443e7b44acf2ab8 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Wed, 1 Nov 2023 09:33:31 +0000 Subject: [PATCH 3/6] Add test and documents --- .github/workflows/build.yml | 1 + .../blog/2023-10-18-RetrieveChat/index.mdx | 11 +++++++ website/docs/Installation.md | 30 +++++++++++++++---- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5e5fd186beac..7fe4cc4fa738 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -40,6 +40,7 @@ jobs: python -m pip install --upgrade pip wheel pip install -e . python -c "import autogen" + pip install "unstructured[docx,pptx]" pip install -e.[mathchat,retrievechat,test] datasets pytest pip uninstall -y openai - name: Test with pytest diff --git a/website/blog/2023-10-18-RetrieveChat/index.mdx b/website/blog/2023-10-18-RetrieveChat/index.mdx index 71d2ad3f46c0..f5c5ae56d18b 100644 --- a/website/blog/2023-10-18-RetrieveChat/index.mdx +++ b/website/blog/2023-10-18-RetrieveChat/index.mdx @@ -54,6 +54,16 @@ Please install pyautogen with the [retrievechat] option before using RAG agents. pip install "pyautogen[retrievechat]" ``` +RetrieveChat can handle various types of documents. By default, it can process +plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv', +'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'. +If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html) +(`pip install "unstructured[all-docs]"`), additional document types such as 'doc', +'odt', 'png', 'epub', 'jpeg', 'pptx', 'xlsx', 'ppt', 'eml', 'docx' and 'msg' will +also be supported. + +You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`. + 1. Import Agents ```python from autogen @@ -474,3 +484,4 @@ The online app and the source code are hosted in [HuggingFace](https://huggingfa You can check out more example notebooks for RAG use cases: - [Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb) - [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb) +- [Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb) diff --git a/website/docs/Installation.md b/website/docs/Installation.md index 2cacceda2c08..9195840cc49b 100644 --- a/website/docs/Installation.md +++ b/website/docs/Installation.md @@ -50,7 +50,7 @@ conda install pyautogen -c conda-forge ``` --> ### Optional Dependencies -* docker +- #### docker For the best user experience and seamless code execution, we highly recommend using Docker with AutoGen. Docker is a containerization platform that simplifies the setup and execution of your code. Developing in a docker container, such as GitHub Codespace, also makes the development convenient. @@ -59,7 +59,7 @@ When running AutoGen out of a docker container, to use docker for code execution pip install docker ``` -* blendsearch +- #### blendsearch AutoGen offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Please install with the [blendsearch] option to use it. ```bash @@ -67,21 +67,38 @@ pip install "pyautogen[blendsearch]" ``` Example notebooks: -[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb), + +[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb) + [Optimize for Math](https://github.com/microsoft/autogen/blob/main/notebook/oai_chatgpt_gpt4.ipynb) -* retrievechat +- #### retrievechat AutoGen supports retrieval-augmented generation tasks such as question answering and code generation with RAG agents. Please install with the [retrievechat] option to use it. ```bash pip install "pyautogen[retrievechat]" ``` +RetrieveChat can handle various types of documents. By default, it can process +plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv', +'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'. +If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html) +(`pip install "unstructured[all-docs]"`), additional document types such as 'doc', +'odt', 'png', 'epub', 'jpeg', 'pptx', 'xlsx', 'ppt', 'eml', 'docx' and 'msg' will +also be supported. + +You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`. + Example notebooks: -[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb), + +[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb) + [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb) -* mathchat +[Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb) + + +- #### mathchat AutoGen offers an experimental agent for math problem solving. Please install with the [mathchat] option to use it. ```bash @@ -89,4 +106,5 @@ pip install "pyautogen[mathchat]" ``` Example notebooks: + [Using MathChat to Solve Math Problems](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_MathChat.ipynb) From fa4523efee91148be6c380e674743a8ca2d42f89 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Wed, 1 Nov 2023 09:42:31 +0000 Subject: [PATCH 4/6] Fix tests --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7fe4cc4fa738..0acafd7cc0a5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -40,7 +40,7 @@ jobs: python -m pip install --upgrade pip wheel pip install -e . python -c "import autogen" - pip install "unstructured[docx,pptx]" + pip install "unstructured[all-docs]" pip install -e.[mathchat,retrievechat,test] datasets pytest pip uninstall -y openai - name: Test with pytest From 47b744f33c98656ce36c9c780bef2a403a60d026 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Wed, 1 Nov 2023 10:37:23 +0000 Subject: [PATCH 5/6] Fix tests --- autogen/retrieve_utils.py | 24 +------------------ test/test_retrieve_utils.py | 5 +--- .../blog/2023-10-18-RetrieveChat/index.mdx | 5 ++-- website/docs/Installation.md | 5 ++-- 4 files changed, 6 insertions(+), 33 deletions(-) diff --git a/autogen/retrieve_utils.py b/autogen/retrieve_utils.py index 217caa5949eb..b98ba862d1a0 100644 --- a/autogen/retrieve_utils.py +++ b/autogen/retrieve_utils.py @@ -40,29 +40,7 @@ "yml", "pdf", ] -UNSTRUCTURED_FORMATS = [ - "eml", - "html", - "json", - "md", - "msg", - "rst", - "rtf", - "txt", - "xml", - "csv", - "doc", - "docx", - "epub", - "odt", - "pdf", - "ppt", - "pptx", - "tsv", - "xlsx", - "jpeg", - "png", -] +UNSTRUCTURED_FORMATS = ["docx", "doc", "odt", "pptx", "ppt", "xlsx", "eml", "msg", "epub"] if HAS_UNSTRUCTURED: TEXT_FORMATS += UNSTRUCTURED_FORMATS TEXT_FORMATS = list(set(TEXT_FORMATS)) diff --git a/test/test_retrieve_utils.py b/test/test_retrieve_utils.py index b74ce75c4641..7338f731517f 100644 --- a/test/test_retrieve_utils.py +++ b/test/test_retrieve_utils.py @@ -14,6 +14,7 @@ from autogen.token_count_utils import count_token import os +import sys import pytest import chromadb @@ -170,10 +171,6 @@ def custom_text_split_function(text): in results.get("documents")[0][0] ) - @pytest.mark.skipif( - HAS_UNSTRUCTURED, - reason="do not run if unstructured is installed", - ) def test_retrieve_utils(self): client = chromadb.PersistentClient(path="/tmp/chromadb") create_vector_db_from_dir( diff --git a/website/blog/2023-10-18-RetrieveChat/index.mdx b/website/blog/2023-10-18-RetrieveChat/index.mdx index f5c5ae56d18b..362e5a5cbf3a 100644 --- a/website/blog/2023-10-18-RetrieveChat/index.mdx +++ b/website/blog/2023-10-18-RetrieveChat/index.mdx @@ -58,9 +58,8 @@ RetrieveChat can handle various types of documents. By default, it can process plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv', 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'. If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html) -(`pip install "unstructured[all-docs]"`), additional document types such as 'doc', -'odt', 'png', 'epub', 'jpeg', 'pptx', 'xlsx', 'ppt', 'eml', 'docx' and 'msg' will -also be supported. +(`pip install "unstructured[all-docs]"`), additional document types such as 'docx', +'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported. You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`. diff --git a/website/docs/Installation.md b/website/docs/Installation.md index 9195840cc49b..e32f4b58d025 100644 --- a/website/docs/Installation.md +++ b/website/docs/Installation.md @@ -83,9 +83,8 @@ RetrieveChat can handle various types of documents. By default, it can process plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv', 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'. If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html) -(`pip install "unstructured[all-docs]"`), additional document types such as 'doc', -'odt', 'png', 'epub', 'jpeg', 'pptx', 'xlsx', 'ppt', 'eml', 'docx' and 'msg' will -also be supported. +(`pip install "unstructured[all-docs]"`), additional document types such as 'docx', +'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported. You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`. From 9f060df3844c29820d1562d408f4630f9e932ef9 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Wed, 1 Nov 2023 20:15:57 +0800 Subject: [PATCH 6/6] Test unstructured on linux and mac --- .github/workflows/build.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0acafd7cc0a5..a59879ff8a37 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -40,9 +40,12 @@ jobs: python -m pip install --upgrade pip wheel pip install -e . python -c "import autogen" - pip install "unstructured[all-docs]" pip install -e.[mathchat,retrievechat,test] datasets pytest pip uninstall -y openai + - name: Install unstructured if not windows + if: matrix.os != 'windows-2019' + run: | + pip install "unstructured[all-docs]" - name: Test with pytest if: matrix.python-version != '3.10' run: |