From c5f07c028d803bfcde1895b1ad81f9525f860caf Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Tue, 31 Oct 2023 10:01:12 +0000
Subject: [PATCH 1/6] Add support for unstructured

---
 autogen/retrieve_utils.py    |  38 ++++++++++++++++++++++++++++++++++-
 test/test_files/example.docx | Bin 0 -> 13989 bytes
 test/test_retrieve_utils.py  |  38 ++++++++++++++++++++++++++++++++---
 3 files changed, 72 insertions(+), 4 deletions(-)
 create mode 100644 test/test_files/example.docx
diff --git a/autogen/retrieve_utils.py b/autogen/retrieve_utils.py
index bc4fdfb75976..217caa5949eb 100644
--- a/autogen/retrieve_utils.py
+++ b/autogen/retrieve_utils.py
@@ -15,6 +15,13 @@
 import pypdf
 from autogen.token_count_utils import count_token
 
+try:
+    from unstructured.partition.auto import partition
+
+    HAS_UNSTRUCTURED = True
+except ImportError:
+    HAS_UNSTRUCTURED = False
+
 logger = logging.getLogger(__name__)
 TEXT_FORMATS = [
     "txt",
@@ -33,6 +40,32 @@
     "yml",
     "pdf",
 ]
+UNSTRUCTURED_FORMATS = [
+    "eml",
+    "html",
+    "json",
+    "md",
+    "msg",
+    "rst",
+    "rtf",
+    "txt",
+    "xml",
+    "csv",
+    "doc",
+    "docx",
+    "epub",
+    "odt",
+    "pdf",
+    "ppt",
+    "pptx",
+    "tsv",
+    "xlsx",
+    "jpeg",
+    "png",
+]
+if HAS_UNSTRUCTURED:
+    TEXT_FORMATS += UNSTRUCTURED_FORMATS
+    TEXT_FORMATS = list(set(TEXT_FORMATS))
 VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"})
 
 
@@ -123,7 +156,10 @@ def split_files_to_chunks(
         _, file_extension = os.path.splitext(file)
         file_extension = file_extension.lower()
 
-        if file_extension == ".pdf":
+        if HAS_UNSTRUCTURED and file_extension[1:] in UNSTRUCTURED_FORMATS:
+            text = partition(file)
+            text = "\n".join([t.text for t in text]) if len(text) > 0 else ""
+        elif file_extension == ".pdf":
             text = extract_text_from_pdf(file)
         else:  # For non-PDF text-based files
             with open(file, "r", encoding="utf-8", errors="ignore") as f:
diff --git a/test/test_files/example.docx b/test/test_files/example.docx
new file mode 100644
index 0000000000000000000000000000000000000000..f377c63c12986bc872f2b22d5bc316c941fb341d
GIT binary patch
literal 13989
zcmeHuWpo_LlCH>NW|qaw%q&^V%*@P^Ww9(~X0Vu<$zo<Ei<vE^*E2hJXU2E-y>s5*
zz163)Ao7dq%8tm0%=}7T3KR?#2pk9!2ndJ}$ah-1*8>;`DDDFg5DE|^h?cOOt+R=(
zv!1ety@`_!ox6?o$2>3)iX0#iK>h!0|BG9oK4H|dmjO}qKIkoAy0K5Pm7<_;I1hp1
z=LZ>neWFMdB@N`6-P?P-!B9$4`ZQys7_;*ZGh?asw|16cRAHKoaS?KxaIgx;*6Mue
z&v03qO+=w!ssRq@%r)H<PaUJ-=lWZWWaW*FC`Lp+<R;2SG=zl{`FoP*LQs993~~5#
z>=E+1O{?sRoKof`qKRJ*T?iPyhUV@%OjKFL`I3e0cG{NLG68R-3jsUm$X2|?&uEf<
zE7!j3bJ*HJV~1>E%q_J>P(Rx6uoB8;eDoEtlXpzcXvryXwN$KSTDPh5o89xst)5o1
z<>Lrqyv|A{v!NnIbsg_YB%b-M&%-Nv*2a$?s%{VOn|Pxehy~9-DRwpbnf}v63n(Tw
z53iU}2ZsLxnBnUw?s!f8GTLw^LC@n5IoFHP>1}Mq94-EmQI6rg-Xclu?evX-3y%5R
zSF<i6KX8yo5Mh={Ha7@?gp(i2SU=>_s=^Ad&5`%6#m-&<#`gXW3MBtGk9|%x5hnn6
z><@rL!T>y0&(Xx%iJtDa@;{#Xzt}zh^62GpJ(8e|h`|^B7l@7*6Rv1TM~xUr=%^yc
zpb^8TY|X=gB$m&2g)@27XEValj|bU4omfm-<u>$ulp?*ANwawg{uzcg+G?YVd%e4=
zVJTA<r^^bo?5KN&UpyGSa`BaH!>D)ihbZY5924WZKtn7=(T#etv@7dzX$oIp@!XX!
zmXMT9JV3YXf-?}9MDwk%u)Rt99e0irtZlXO53}aKk|x7P0BMptrv2>ux<(K;yc$9p
z9HmIhT6RDqE^rZ!OS{FO;KDV1Wz(Z56bU8hkmHb5VFYXLIe`L)*TtfT%W{~>Bj)5C
zIDn1D;g}BH@5)XFs!W<N&ksIt6hVgbr5=B?m|$bi!l!-Erow^tB2@p9{$TST{b{o0
z)QK1j2xtib2nY#K!_CgonBLgV$i)T_27ilxeaWk~J8Xyon=n5KAbA@E40S$7B6-z>
zo_!Pm0UN&qH4jY+Dfs07?bN-iQ&{raB(bp8zwt{<|0piJSupe5S#ncGF0DcuG~dm4
z-5-SCjbi1qCffPCDk{nGXTWd3n0toDR#DEpQ{SIP&ur5|nunm+ArZCZ+?cXFQW+hO
zpSdPJMY+%y)gWtHOVgvb(6ixSqxg_3Z;K0JHnW$ANY@A7o-=M5o~blHRN&DnxI=tL
z=eXQ~r54_Zinuf!n(DeDmyw?X2IG$SPFCu}Xt}FZft@2rj%Ysr5w0zEV|lF@hte``
zVExl8nZ-=x9Ar^D)(;_M_yC+-N9&vtMgGBKFTyHdnwngzk>X`gX3Tj^>BoC`Y`;0b
znEUu}C?kFhD1+B;g(0bG6nquCRS4Irw-%IouXrSoR!uZCZaeYpz4av~Dmbf&+u>9-
z9A%GT4AuERxbm-K@UAOaFvl#1?ubFHom3}RhpYD#p4;XcDc0@58Wp;(btWXo`N@I>
zVeB;f1oI-H53Cl_b`g69gSb=o`<e|#ca=c%!1Yn%llt>arKy4m2;`dfrjiCr_Y-t$
za6MV9mzW^l%EzDt$jOH3@<bYRys5+CCaU?sT1n$Z6R`2^pI`dKm|oWe-R1LQ#Iav-
z&#qFC-R>M6fy=n$i@s|6$fhpBN)07=@OVqN$h(gV5Od%|65(<pERlnSy*gFxj^AqO
z5vLoqbFP&x5G}9zKp6#;^g;6r$-^h-&%fqv-fPp(kr;Um&Lemh*#}6yrFaNVbscTe
zp7zyR39>co7jj-Z&g`Kp3t1;g_EaxRP3cH|Un=uMK74fbuSH;f*TlRMc&M<$xjTB=
zOxC^LRbEK*EG3OH(5A`@T|u^sDi%v2&ah^Tmhg{2ge4W3O=v^^uCHg4V<m-2LRo;c
z2Pw%+4;L*hHpO2OPnE%bk8BjG6I3YDM5z4i$60%n{}EcPOz^?(>5!N?49@dA73t-!
zh2PfR@`6|77cWoGg)ZWu6Z3F1v=1p-l43ix<#$u;)!E9ZbaY-MFr*&>sirhN9{9r$
za6WVgeHZzt8pr(byaAa!`h_L+MQ>w@1p>wHj*g>@#2~Vjj-rl0&5X`hV(BCJRYmz`
zM#vJ9$%pcrN<>rXCC`iYCE5kn9!^kNKD>k8{>6{j!EOghaObwcWy9_+rpxvWt!)Uc
zny#`fggH?S(`Za0IMv;WxBR;x!+jgNC!QAr{$w>CHC|5|Xib?vZy{iN;3y*U{FD&F
zY&bHy+v-xJvul_`e8V+4D2gc3OmjRrptPN6Vf%I*Qewx4&Bg46evEKOX*&@Ia#|Dy
zIgDGjsCQGY&odeyG}}IOevxm~md4uK4NAkTKaE~1*0Y=LNMh~f{}MO(4v4}39-$YE
z>lA(hdJ;xZARt^INZ{Xll>Z9Tf9+TP6|#W=$u%IJ|L@+)<Ni!8Q@m-pz2${=ttAOp
z7hZux@%f8spvHf0bya<mIGNIKy>zThahmjHTb=9)bBmbsD#{shL_CbB@>-;FFTt-m
z7eusDvpT_}!i?3|($%%jnzEFYn!EKA$>UI*V*J{WV9bTfV#1drMIRX*u+K7}3T~t8
z67HCI;Kn42C7;F6U#W`dcFP)K3Hw0MXqUF}BY@owl#5S^kl+42V(f@RQXNmr_(ME{
zlt|2hsH#-h6hXKVQ;|GPX?K2qNw+~(S(i?2)-2syBf-E$b&6B7#!T*26m>@vN_6%i
z1)&_D=ObXO_3u_==KE4V0S*K-{t*ZW4PXqvTaB~1iH!;U?<M1JvFKD|JRFA|wF~2k
zKcs~lKe{W~J%GPX#x<Vw<l8RA>UG3I+VscaHbz4EI3@ZZAx%%pp9{j2a4b_(Qtv52
zfo!kDR2J5gLkrF~HMzN`*bBr*_gP;&FOH5hoNiYFVti5hI0_X;DRmt1eDfGHe+=1i
zx_u20>IK&BHV1)<lpodjc&@$r`ZK=FUL1@<L|{bxGel#CK!|&h`mWJ3+#?!x{--Dz
zN|@XjnSBg$*euBoLQ2m{E$U!Iarh8@iusK!*(*%HT*af)%7pvS7YwLMu7N;2%K4#~
zsiooaZ=DbRZcw1)G4;5dlYv?A)1X^GAT2SVw1e|LvJUr_4zwa7QPyI%Iqyl1Qhvi@
zvyNi~6(80+7mE`Xo>+ZDY_z*_`K-LLfd-9QW&)+qwMrM(f}8qG1%~rQ-Rg9y(jOw;
zt{dM>D|S?1+rder6VF)T(_VGwyRGD})z`l_=~T_q;JCv{;#+AwNOtMmKY{B8fcWPx
zv$d+)u~<s|#AYfmo`kkru(a)H3M?u%Bx^-}{IuJz2^md^^U2qh4wVDM5GkJ$S4x{3
zJs|*ppV1EnqK_d@+qF%%gaou|+w&y4kz-AefjHAS@=-N;8}uP%OT9h=N3ks4ul@7Y
z_UYA4-C$QHuZQE^14ec9tAP6Mo&K)bp6>g@?dGNr@Y~bv*G0LmnjH`H_u~V4z1O$F
zt`ZFJ3cKtM&$rV(H@(-#StXeE@gEd01lx!X_Ar_L+$;$&_0qm#r)MO=z<nB_uDec{
zf}}=*_oDhacC;V;(6u2_UE2qZTdRSqCZplLivy>_SOags3W2%{P#(#f>deZpsx3g@
z3^mD3>}#yyLCRS>xv{seWe+l)*T|T#3xJOc6Y0QvqY?toVP+B<Beg}>7%YPFm?q4Y
z0Npbl3RtraBSuG<O9tK9Q@qwzBPG1CSv%PyI8)S=T;!slP~L-+uqB3Inzq9qfCQ@!
zFpy~M@}c@g?oRth!UZ@H@}3b33HtbRG!3tr!UUYlP~0TVpFtQSrF|m191C9inqV5s
z%XI?I5K)ogfNETmeU%EU0101G9hb8YuV8Zofw|ZEj4u}4Ho3X#W`bP{5rUn{yqY91
zw-yb<6f{)WJ}yMu--y*HCT@0)liLxE=vQR#kJ$X23MXgOyH9z9<2Od-HDsEOGZmpS
zABC9Q5JzQ86V<;4xlFt}i^~TQ9@$sts{pBywXem@Z2Uw_#-2~BJM3=yO>u0lupz#A
zT-eCFMH2zkJTTGH$*Y|%Wvh#Q*+_B!0nd0!YjmjZZJ-fL$WS^4d<@aB+D@uTvCrOz
zDz7mfJzZ<BaKBJyU4D(iQwTRx3HX&gF<Nxn&=Q~f(z`UXJ+M||XLpN@I-Sl0<>d;4
zjh7=DK9)y{W@@x`zqaymbCK2K`LQ;wrQDRR46AJjqS$l>L`)B~g3hZxiQ#_VEDVYh
z8OHyvh0eQq?K$4>bPkTTzSBGm^szZ(qWcl{*sQzw0XO^yg>Owp;Wx9(N5zIoLFpEn
zZ09DGChU(lm%g$|_wJ<=Gj8dMKw<Tp6_w8>C1i0Mn!Fv5I%SKrdU}nOkQB>Jgt4h(
zBnpi&#Z_FBgt=<2gfwaB=x_ym$w6<Lo}&1&K&4h!mZSzGy)wZ+MII?*^*rvs^Jbb!
zJ1u9Kc*ykM8uHl03<ai35TOM7@$j&<(Jw8I&lQ04nuOV4-wu4eY>*_vY4H;=g~=G<
zRjZU_HNtGlD0x{XA?Q^<-l0+w+!m)bqe%+>l01gYOy?N`0VRz<dV)|R0E>t_qCVN`
zCqmg2r>180@@bfLJ$lmdiTcEtB{Q#P=tv^nkw(D~&s(nAa;k;I8&z7iCuD3Hj{;7m
zA~^w=JSDt=l|047pSxjOtX0!ibO%npRg^Fkw3$btc9_TS<bXAEjEiM<TqX5v&rqz>
zamFjV^#G>CPkLsIQ(&&dVv>O1JZ7?5@}{$gN@6{|rLtwj#iOY2nE(>Y^2ivRwSP&7
z_KdYeM_(`|)fO+!!>ms<8$4a13_e6fr$kGOzUk;tUF?gTYMB#Jb0L*a1H+O|N0Ky?
z-?@%SG6cvNmMtg3x@FtUZRxuIYVjRRqGLZTOX-<ou1-ag)T`z2nM=F)3<VLS)V%Sz
z-%E`g`l#D|tIx@+cQaK(RfL85w^n#MOM}<9&fUGPE(mqaeS>ivmeFz9iYb~Vh#<+T
zSqhf62H9M-lLfB;-KG_Dczn(uA_3HAj!lXu?9dI0O<Kis+M-?4G;`Ehi<NV$kzWd4
z%`L22uIKMZ+S3-#`2peW-vb~UdQJo{U?2t$`<H>3lZmslg{_&>Z^5r#b=7v21JPSg
z|Gj7GhAQr}0&=db0r?f1=o(!q>a(;&f@mOnOaflflFud~sith5776WjoV=6i1nh7x
zR+ra%_n~=sBANeWz%B{ASW0pp(Pw6a&W(!ib^71nw$bwZcRZ>Gx!irOp6xDNQwIio
z)A4~nEMid~2pUC<<$X$qGyRke8OrI~AP<Z6pqw5;=(0IdjVetf3@sG8EWX1b8;~V7
z7NB}nUId9nnK+fl#zunTA|tu>euH#O>d_WMt<VVTw8JC@tyC;%zm8bKi}S3(281J5
z`#Me57~$EagWRRx#+_R#+$S|s2do&1Yz2qWS4<ysNqF@WYnt#LYWKY9u=pcS==yQ<
z_3{biz$D}hGJkv2PWeH|$Lo}@%2hiO2t39dOC+x?hEb5tCZ+o-G)6v^v5}(Ste*&!
zjj^sy!qV^#@2ok9W%TmW;9E(9gi5{QJB-y+*R7l7i*w}RKbe6w@jK2hciD&BWovCi
zm^B?BY70cWqxwg>ruLNveg%OeiwYP5`6#Dw0FIS2RNt4-8l|Kd@o|T4m7*_4%{@H4
z#6D8f@?1U^hp@*YT;Hf@C#%*vMyQ~ND!CY}QK}~rdG1+dJDwqV$_<q#obS5}VVfnZ
zuFhs&hxs^CoEsiICWm45ExC>rEtNI34VuRz-ZGGy3==oq*0Tr*S1l1*XPtOfI&VMq
zlC&)TmA@K+mJK9B>&7h%+DkFen9cs=$GEnRKI#Z*n87=-;kEH{idd_>1bOy$kcxH<
zCdEZ{K@gA?t$>UWWNx<!0)|q3#7+fW`K8Stuy)Rl5KJ9<&_-a-)yZ%fw^i%kW__G#
z%rFlU@51SYQUa`aIJ0T0$aD(O@nVeA=;)uDexg3Ur`*iWp3F&bav7VX;syCocGb?{
z)zEDt*9al8Xs#MyX+#UH(?i|6`{fdE8Ijn*AOssgm<Lnn^bmS>qm9>wnR^6TH5FQY
zV*|Cj$bFlI_TJm^8@;0}e^bs&#A{Xh<Qn&rZrFK69NE?u>c({)y(M~kN;o<vNiDl+
zBy-lDr){hot*yvAJVQByet&CmO_xq=g@#4&=0^<_J^$@-9#zvww-t#Ad9`8u7^oy$
zq=a>fUlsHh)0SvM=JT4G-T}OQK6wjvn2lG%>d20>A-ipx_#U>~#v}c7k5%jHf{i3i
zEl^rfK|ohf@m$cb=&xAQJO35f$MhIpkp0tknJupGwusr`K>)rD^9xWfxLRL#m6`-q
zF#hQ&RuNnKePQJ)|5ZbZOOhPcsrI>^+fS0ElD9U^(-bNw3YN_}!xTPpovj4(Ls(?+
zM5GZT9}b#S(?klpQ<}`D1D8y?i5*dh$u(c*N{{XipPpM}3+>w!giM#*O<B3*Bcc|~
zAPXL?z}b02TspG@2M%7T8F$Xqg5m9_?6>tNf@_R_9-=)bkcIsOO!#OQ0s#Sb|CMYz
zIeS=}{NB@?>a04hiXnS%mcBxuX1fxg>U>g24vUL3wmQ>XRiN>>;Q<RKB0}u_ATT<z
zc(DfLZw(uQUys?}kXWh?Hf09x^u$t#ll*vfV0T0EFjB{Ze$@Zt@@s%#_pH2#!;?MZ
zgQHKK8yDoYe*uz|$Ot|7BoD#M%k1X+-Oj9sE)O{m0*OFc_GE8(PdNqp!Sk~R$Bt9d
z`$u%1MD8M@^g5xpG5os&8ZT|SJ0V8BYND(@32mMW0zBLZ2z0U&f1<)!j$J#E=8T*A
zsk>(^Ig-3jHyZ6jQuRf2=5j>g=&6t%S)=h;<luG_+x7mOxWG9cxTZuHv)d3JOpW5$
zl4%6^hh6)_&_6IC?7@Hr2;YNyIT|pZXKJHR#5pn?f>6X+al0nwfUb)}Oeqrb9kuTh
zX{nbY_UgEZ23?Mu{k^iv7}LHUTo5S6)<_V=lAQRF&`ot8zhA%1^FPVG$=InCF+50E
z5+ZNm@R^K$8=9XyWvk;qqko-@{1ndP&SmC-Ie1?iP@CvMgY(%fb^l7>e2a#j?j~j4
zt+Qcu?k$z8dYWl0d}Y6kBMjoXbXq*C!0}!!NgFn5tnek;vbT86vEr$rk=EcyxFxd4
zOYi*nwGH!yakcI38=H)=5(a3}*}9HpZ|}4RGJmyU>hMy|7Yi^Hq1r=Dc;I(c8jU>o
zjV2?bw~ELm!<*#uJ^!GZbJTQnZLzT4E5Bnw<g#vOXGhJqhj!(c8WTcmV(ji0t)6qx
z8DpT>t=$O`)|O;B{*cR9akro~2gWIz;YhuWP1Bko<K0%wjLhXox6k-p;rSs&4WP3c
zxL2E|Jg;##<_)ilIC$Qt!SXapkN20x<#?3C`AwTbo$@u(=Q63=KjWD`%}tNG&o{HZ
z+d;VkLTPQbLm`N#eKS;UTeLriBD%+!bvk1*&Ov3g5G%`?ig3OTt8;r<@Iag8p|KjA
zg~df4A>2(IT)zcL72cEk60E>_@Qyq&_8=h}OO&vbIqp~GDGqqj90%N}r~!uuM(n|@
zLdXoEKsx$z&gZGEv$N*aDx@#XtMpKSq6!F5fOZm`-#wgvZR5>&-m;C}09*=S#b1<e
z@V{3ywV2j2Q6daeM&*%@-`FsxNmS<=E$C6g;LVE7AOVgDwZ0Qt!=W{Px_3pGCc5p)
z!l2TC?PEz9K%m+?T6f6$vKOj;=_?xBM+`RBEIootyuCAvm3<viL|Ei(%_W}*1%Bkn
zM|e5@!Rur1wuK1KXHp^w`C?qn{YhrhxRmaa1DY|B_ev-+bOOFUkM|nIk3I8D5*$bi
z<k%Rq>aR|qaI^fZQw*=nza20ZJBYE0phud2qrsl`)nLhUV1AQewFO*Q16*j<sDImW
zCBO)Uzq-{JG`L?`cOjbd4&8Wd4ku95Nj!@*wkqtQ9bFP0K(7-(YfLvdIf$@1kQABh
z;q7Ow5Xwtiw8BtmIOWi}BbZ`kp>j}h7r8FI3;|Q!x=mgZ=af-Jsdr;}99wr(WC>Fx
zc0oSFlS(obTGg{DKfExY243sakR=N+RR=+M`&L*NCL1=?bZqh-RZ;ykYROC$>pCff
z{F4c3i&zHp%YFNgROwNCFXHtO5+pgEJmGd~CWFrvbdSU`66C{One?VK(Jqzs;M1>)
zdVJG}pN+#e$eq)xISK%O?nXwxgaCVhyW9JfusuCG)wnXKFsM6en9$fTgAM5=LD$tV
zzXg{&W&DqysfX6|p(DcBr(9&sLzpB;NQA={rWpeyi}I28c5ggLG~wn`w#>HpL{uC|
z+j!~pW8VS<w3z7lh986(aAw@!4K&J+g_QGJ$$`rf;coVJ-SF=aT(U(36tSL6G!CEg
zvyLsTxSHQ)16iBjl)%Od+j%J7Nja7QXXI_*;x4s|P@^|X9x(Gb(Z|kGJjXX@*)JMA
zu_4y2(ud&HU?ey_$T8NCqO}=CoO@VSJl|0k+~*FY4w<l{b4ZG=-V*zfu&}Tr!l2;9
zL<7L{w*>-Xk*D6FGO^X)2c0eTU$f^K^+l9?u`v^~V|Cv`&#&jYi(jMz6dqH-^RJoV
z#P-a3*@3fV_efICk(C0zUAJJ9tOt?NK71x*O5#Nk?38%`C8mhMu3wdwo?j&;VnuL3
z5h%rW5#m^8vsWg0A0Ri@QBqHMB`OZ7B|-8pZftD*VLp0mw%C|@#4hORw#8;Zm>o*8
zow}2;*2tvS$y{Y(?W*UNue5w4{K7!(-=?8*R@EDU(5RUN^_;eR2cACg*;+5wg5`NA
z-<^}58(Oar6QK>qDq5GBPOr^^QFokVfhL+;^E=z%Dy>@GF6(G~R8sZ$Y*dnFkyVn7
zr+aVhpqMy^#Z5NVpcBWYL^_7bK;(EMr=rR7u0OerZarDz4<^|8b&w%h_VMPkw$MSF
zXoXmu55(l`l4d$BgezcosXmeOiFhb1E?)DqC?C&HB%F(e9#0fgT3=V-iq7d>I+9U}
zSPle<C#s(#%Ni(l@CPTh2H>VS&zlLkAU?8-6n)xd5f36^mpoY74L-#EwuX3U{-Fx8
z+5E$q!!-2Dvj}y_HcEx(7|0r8W)Y|@WW78gm}W7rDYPwQg1lc#vOFQQW-%fNtumw^
zI#q`6Rv`45gHX^NpHL9RZ(s9Vp+6vh)G0k!+pst%RSxk-rwSB+K@-XeDm@72{eQs#
zb)nEwyr9tXa)O`<r-XtK&2!V#;9i6NAGiA_+;Mu)|Hu7o-Yo1CTJh`n`_y<d<xbtY
zTuiucTi&SL@GO_`N>3AU%AVUG)r5G!BkA$sT{@@XNU}(3rbsOZZEm?LzmR~q?A#=u
z+_8T?KIG+ZHQo%Gx|ariC@v!{E{Ryt?zDx9Ygb4wTBPkMcbY@@K1uq)#9;~3s+qhe
zOnL1H7P0zWdi-I^U<19vYTrI&)zEnyG0I-cxDx9-ux<DtYwwQ0hXa|Dov0m_$}IX1
zkT&P)!W<67^(~~krLcqYm^u<p#Lgy@kyD9c>>y!sfQdnMcGNTj_5^}?c~kY$*Pvsu
zZbdkx_F`~`H)UZ8yHY}cgn_}ZBp4DP4XDHE3AY5=kz)<KF2ojkRDh#5Q7<I~NwN&_
zEoeRXunhGV5xLuci?E5@72xz=N`mD#0805w1(PBQv5EA5S4!xQF%LA5Z3($2!y0Ir
zuc@Z6P%Or%%{(ac-7>ft#&H}GBf}~fe_3&Sg;}eV(A`ujrYz1$yHS~Z9Pkvp>_TeT
z>yQ+DnGEiR>+<Ho>+)ZecWMjB{&<wg(WH#u&Q(htmKCYzo2`dsSn0GZ^RSa?SryI*
zGS(q=|3@(Zn^t`8A?W)3K?!iISu$WEW&4{R6o8(RP)qH`IT3&|N5y{$D*&oU+5aZ|
z{Z5>}A%7G8{xfF8Es#+@gB;sV)Dq=9>#lJpO+qic?*!`2L>v%3j^z8^Df8z{IKErh
zJW>Z$B;UsmVmOY?VN6+7QH$=wRHMriJn)q!l_(CBlKbiKj@d5YH8xqvgs<33DWNrY
zsC|a~p%$)ngia=%HBWEJBCu2P#9-L#x`()TSDSi*H9EGDSe7)X&R99Icf>QgQ@rMF
z&%lGD+fmcxezt8Aok7nxe0R%sD}3GS^lH^mxh&$XBpuZ~MPqH6%*owQind=SL(oa4
zB9+9oDt5x<bb6oUvOgz#z~y9>R0cTOFSE>SkuPaKUK~F(F83BDC@1dj_qBMfVk`Tq
zNHR4aDX2~>rM6*RNiSvAIFBB2<B&yMbhCsgvcn^emtjUK<+PSYp|{j=+UlXTUKRnn
znd${cyX<n$wCdTqICZ3(M1y^g{e}Eo1-_-Xqmgr(PJ3@?Vr>C^Xbw<eWax<M?hzkb
z4XeXZjkCf_66-Fv=n(yomz3?ji>>42gq@dnd+7-2u7wsmn&XX+N2qM~wm&M1Pd~}m
z<{CJ?f99e4A@RAp_#9LgW2$z^LY{+9>D`7aP}9X?5B|PJbNpWMW_}h&He*Ad8kuFo
z{#wUfnDYlc7NYCedr7<xH+AmT`_IJdu1E)3|IFj}Ly_0Tpm%mur#@!#_arpzx5>A>
zmt^6WxUeEKo`isLp=SeEalCQ5YGrT2PxMvwM=((;DY!f<+e>Wt>#uamtW5=ztt+dV
zKr9~&it=6V>J8&COZNyal=jT!Mx;u!#5Pm>;-0HK;yxSio7dPAq+OQ8g|B(J=X4FW
z)c6=CCE2f-J>dtU==*dq@0+z{r}UE<-8e?nZInWv!Sz;b&K+~l;CXXs?>m~G&jF{u
zYLyk(-p!oD!Uw3lEJK<{a@hwlo@`AZ?!!yI)2HWAvZBdi6o`u?tZ?Zpmd)Hx+GmfD
z#u+FXJ2Uc-7%p84PhMuL02SBG7rBQRkIp@6TSta0)fNmd_AXSyxPMZWt{CqhVML;C
z<au~C56OT~x^TQ=AlvRqI($Pu&Z#o~NXaNljzLN;lCZ?JOr1E8E$d(M()@{ol&DAd
z!#i@T`_^Y4Zqh=`oxI%$z?{K#l`=*fRb48Av2|3#@(N4e(o>q4Ceng@LJbs7sVg|N
z@qSb)Rn%}UbHC&5&W$1`mKJ&*?zS@ARvhVFz^m!5ZciOv8S{);hZ_7S@nmU@-gv<%
z?cklKR<=&MjdPwh<I*9i_q@6jYm-y^M7isI2CG%GW|>!fjwM7;oK&?c<0=TM<k*p;
z?OO=S!&2h<<9AZqQyMUPA9IwLb+S&Ygyx|O>M0j;N2j+D@+f*~jXI|BG?4VMlnT_e
zxlcC!JhkyylVw(~7Hr;q7X}t3j%t;wA6BeWwW8qXe00?FWdqcN=Rb+-CC+iU)K#kx
zapSd^Cabu~sE#^XM<sqKF=+5gN9fJPTzSG8AvE&-Dz%=in7ve2xx|)ya!5BXv3fC~
zLeRU)V{_TSlabHj4t6Zmvd-=LkdlNKzCYF2nKHt5e~7$&Jl-JMnk$E2(;W7*YrnIY
zk1jdOIC<|)nAfoAE#|FYJ5)6r-*vLq-38({NK28a+N|-c1NwHfSw+UE$Y>qMTX^wh
z=Ic+mHy=wZ)CYyKle-Sw@=)XO!A`FArh%dPB2FF9X0AND1RA^B+`QesdP{NYWI@sv
zQ<$_G(D<S2$I(b6+V>HkCxU;RjJ$xRl$8an%6<W?;iCR)g22thQ0Xu8kr}gkR^J&A
zfiCZRI)#VZ3P(^WRSjSVbqDlUAwrJkgGpQB!pqKQFIGyN9t`i}-tTwl2ygZ#mUmG=
zuzt2+!YRf2E~;a84fj8<S~8$OiSEIeObgT92^h9CccijwfS#y@#L6jt4=Wlw(pF|%
zNKN=gZ66k_9&j*@9H10iWI&t=O|a^jGd)k0)=)!szQ-X`wA~$Og-RopzXg1|;1{ja
z4y&&rB9~ZlRLr_)4i}sUU+foh;&3<UH1dGL+}h@wF%1kg%CTz*kL7~;HG0N&_mrOD
z2qJ>sw<K7H-^i7LL_CYT@bExdE{bv@<DJwATplUBG?|HSqs-b!ML99=SUd2GgpBOk
zMuXd~^IiQF=G;ncUscv+FDc$yqqfhgi|~@mi4Bxt4*|%DdB&qZTJW<koM6mZZ7q6w
z@_~5P5Pz5H72H2f25)w76bJ!U*EIk%RpfuQzbSy5s%&6rZSvb}@LK$|?Fs{G@CEr9
z9?@np2e42WHWjpJ8q2NW@wavCL6QYS?rTOcd;^3!jHPk>qAJEgs08%VI30A6?HBTc
zos3OB_9iU_cbq~$>eQT^ud#z9DwN3@{+;}Em4>^1giNGCaWsP5;D@f)kAPJ$T}q4Y
zx<_OH(}04#5cipHDgz3HSB1$UfV$uV<DG(8=f|gm*7iKfguY8s`sU?F!T}^p#wX-3
z%?b_^C(%WJiJfcjacC)JWJlfaS*kihWeKuU6%?S=yy1CUP~Z1}QD9{S{FN#MDx@db
zW>t6P*?7BcNyE&#t8qT^0E;Bpv@vccZLN+O!OOKd<bi%I63=+2-iHJ#k+_jfu)iqN
zJ3tn81bJkPb)mL&?>-Ybi_O`gPgPE%fsv|hJ712x!ENqbc{@qhs;!tkL7)1D_n`_~
zipYAuo&`EX+9}U(J{FD=Igs&seO2Vrut_a<8%QQ+H+bL6TbQLv%0j4{8ArVrnS3Zq
zKBCt>5Sz4k_<Ytb@iOO!>QR?+@lNyL56@|;NnEv}&3c!ajoAx5IiI)Iw@(K6<&9PD
z6UKJ|D9dcZ-$2lD)2bkdOT+8$?^fF%MhZB%4v;l)=7+zPFdt2@E~d;yn}ttXGkV)7
zB6b+fbiZ+>{T%uLdf}(C!A@FEd;kYkLFmm>@54}k!iS&1y8BWy9ktpY`@9nK+#gHK
zTo$ptwLV<a^5uPrC1VuNL1}B8GvULs_s^rt&TATWyVECe`~X+rc%Ogtc&FGv92<bg
z9{{Wl3BcL_P&MQo?d+ZCjqDtMH#h*h@;_7!K+{FU>Bt5A=6EtM2;c7Ip+gnEtLK?w
zm`X`!OuMVCGOI!qShO*|b$kHFalqKfj6P`R^<H)ULIZOFv=SmYfe0nE>t~FH!IYhQ
zfBU^lJgY|8QBxV&v=}&GfB8&iMEN@vTve-LP6f~oC#eN;p{co5g(L%;H4G`MAMrOU
zTPhvJ6U9s;(dDRp-+YGlUQ+dul7eQmpYz2bvT+%aah6r#vcAl)9}+%R^pJdf7}K08
zXyKZ2p&L?M66;ZMHjnx_9fDMT!d1Kj@A8cD(xTbvqIMf(-C^ikn^4yWttF7O?Q}>~
z_x=WK`h1<zoMMsFIJ2ziPVYa`4jOD|bq}A`I<wY1EwRr4A8e6Z?$xL3u4wNp7^%~u
zwhs9tdb}UY$BVXd>*ESRY-|+f>`!C6E)4t0<+z_@@R`vMM;;vl8mMA5azgxMCzphw
zpOAy~&Me~}>-0S|f4qlXDLWt@uBUz(KgIZ`UXi%JK*9}Rqay$tMf|Ic8ra+a)+heQ
zL;>~**fJDHWdT(6cH%1ph4VQJg)(THNO`2~?tI1BJL~22zV9`(t2uWU%Pffg&9So+
z<!(H6jBEH>@!t9&E6XZ;nW!GYs$2y&9V2dgz4J_yhRCW=J2aKoC<8A^db-_Y9MOap
z5m*vS%_HdvFK|W*Y)fGrxs3&~dqUL6J@U+IDw^9J@fWl!eWs!{`tH)^Xr<Krkoz6m
z<Kj<M#g4T%825X_@%Uenwd_QM-ezRdLLEy!uvN>ql1P_m?K0;<0O2SY?JDVe14Cda
z@7{o@>-DrE2@Z+C?AGmpEJ5~W-it}anm4>?jQ+?gQ@r!MOLw{H9uW6svR3GX$dt{n
z#T0g*NjfKY@)jKMFAlgIPi_P5QH2h$lANSAAOd+N)IZatrx~uvCF*EuzA0$CK)+(W
zPR9-6Ih?cZh%w%J^5l!+KJHFZSKjrhW<b3B<U^7on{8?ndHg)KDc02O{m$~}%t(32
zZFicUzt)?+l?`a}_kSa**WZMh+yW%R0E5K>>;WCBjES>>v4OJzy|jU$iM52Ssoif@
zs~q!R+%KPgbNW%mOm((K&~>yKl`es31)3>h_1P)Kr;EfkOSfbgCl0s$QE}LD(7SCX
zO-MWL83Cr{Cugl8ZZ0YbC3(^^A=&B}If~ud39>_m4+VtU-aUmvim6eKggD#YhUF=6
zc~mw(O(#jYt!1KUxDVhMrbDAdeY)@i)dWn?sID<q-P>a57hwvrJO%frGA-!8eOqyq
z6}0le9#kU#_Q)5J2rrMoVL|jG$?EfS;)c!qJtRGZ4Wey=;-ZY%pv7cjKCxOFG&km|
zgRKj{a@sHy;a1&XZOl_Np5YhVpJ~fnb?>W-f2NzjAhdwg@}FPC0W{a&wtsjnM_%gh
z3jY3Hz+cg|fE@NOZwdSg{FMOxClmzYAL!7(!vD^8{1XfWqzv;1{D0s+{;KI$2Hc;z
z!T>qqKl0&zRq-o_;7=9dfGqxZ6@O(E{0jedS@2KzJ?6jQ|Gh}~EBMzT!=GSG%0Ixr
z3?P2h@N3fhr-l@&KQ#O;1^yNP_Y~$&G!PIp?eG5g_k`wG_}`<;U*VPXe}VrIYko!l
h>h6EywV3`d&Mz<Z0YG{L0)hp6AOVK@mHD@`{{uRm9!CHG

literal 0
HcmV?d00001

diff --git a/test/test_retrieve_utils.py b/test/test_retrieve_utils.py
index 0585b2a147d9..597808191032 100644
--- a/test/test_retrieve_utils.py
+++ b/test/test_retrieve_utils.py
@@ -17,6 +17,12 @@
 import pytest
 import chromadb
 
+try:
+    from unstructured.partition.auto import partition
+
+    HAS_UNSTRUCTURED = True
+except ImportError:
+    HAS_UNSTRUCTURED = False
 
 test_dir = os.path.join(os.path.dirname(__file__), "test_files")
 expected_text = """AutoGen is an advanced tool designed to assist developers in harnessing the capabilities
@@ -43,7 +49,10 @@ def test_split_files_to_chunks(self):
         pdf_file_path = os.path.join(test_dir, "example.pdf")
         txt_file_path = os.path.join(test_dir, "example.txt")
         chunks = split_files_to_chunks([pdf_file_path, txt_file_path])
-        assert all(isinstance(chunk, str) and chunk.strip() for chunk in chunks)
+        assert all(
+            isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
+            for chunk in chunks
+        )
 
     def test_get_files_from_dir(self):
         files = get_files_from_dir(test_dir)
@@ -158,12 +167,21 @@ def custom_text_split_function(text):
         results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1)
         assert (
             results.get("documents")[0][0]
-            == "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities\nof Large Language Models (LLMs) for various applications. The primary purpose o"
+            == "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities of Large Language Models (LLMs) for various applications. The primary purpose "
         )
 
+    @pytest.mark.skipif(
+        HAS_UNSTRUCTURED,
+        reason="do not run if unstructured is installed",
+    )
     def test_retrieve_utils(self):
         client = chromadb.PersistentClient(path="/tmp/chromadb")
-        create_vector_db_from_dir(dir_path="./website/docs", client=client, collection_name="autogen-docs")
+        create_vector_db_from_dir(
+            dir_path="./website/docs",
+            client=client,
+            collection_name="autogen-docs",
+            get_or_create=True,
+        )
         results = query_vector_db(
             query_texts=[
                 "How can I use AutoGen UserProxyAgent and AssistantAgent to do code generation?",
@@ -176,6 +194,20 @@ def test_retrieve_utils(self):
         print(results["ids"][0])
         assert len(results["ids"][0]) == 4
 
+    @pytest.mark.skipif(
+        not HAS_UNSTRUCTURED,
+        reason="do not run if unstructured is not installed",
+    )
+    def test_unstructured(self):
+        pdf_file_path = os.path.join(test_dir, "example.pdf")
+        txt_file_path = os.path.join(test_dir, "example.txt")
+        word_file_path = os.path.join(test_dir, "example.docx")
+        chunks = split_files_to_chunks([pdf_file_path, txt_file_path, word_file_path])
+        assert all(
+            isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
+            for chunk in chunks
+        )
+
 
 if __name__ == "__main__":
     pytest.main()

From 33b610af2b3e52acded2eb408950c0dde416f7b2 Mon Sep 17 00:00:00 2001
From: Li Jiang <bnujli@gmail.com>
Date: Tue, 31 Oct 2023 22:21:25 +0800
Subject: [PATCH 2/6] Fix tests

---
 test/test_retrieve_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_retrieve_utils.py b/test/test_retrieve_utils.py
index 597808191032..b74ce75c4641 100644
--- a/test/test_retrieve_utils.py
+++ b/test/test_retrieve_utils.py
@@ -166,8 +166,8 @@ def custom_text_split_function(text):
         )
         results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1)
         assert (
-            results.get("documents")[0][0]
-            == "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities of Large Language Models (LLMs) for various applications. The primary purpose "
+            "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities"
+            in results.get("documents")[0][0]
         )
 
     @pytest.mark.skipif(

From dc6f7cc4fdfb28fbe00574e0a443e7b44acf2ab8 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Wed, 1 Nov 2023 09:33:31 +0000
Subject: [PATCH 3/6] Add test and documents

---
 .github/workflows/build.yml                   |  1 +
 .../blog/2023-10-18-RetrieveChat/index.mdx    | 11 +++++++
 website/docs/Installation.md                  | 30 +++++++++++++++----
 3 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 5e5fd186beac..7fe4cc4fa738 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -40,6 +40,7 @@ jobs:
           python -m pip install --upgrade pip wheel
           pip install -e .
           python -c "import autogen"
+          pip install "unstructured[docx,pptx]"
           pip install -e.[mathchat,retrievechat,test] datasets pytest
           pip uninstall -y openai
       - name: Test with pytest
diff --git a/website/blog/2023-10-18-RetrieveChat/index.mdx b/website/blog/2023-10-18-RetrieveChat/index.mdx
index 71d2ad3f46c0..f5c5ae56d18b 100644
--- a/website/blog/2023-10-18-RetrieveChat/index.mdx
+++ b/website/blog/2023-10-18-RetrieveChat/index.mdx
@@ -54,6 +54,16 @@ Please install pyautogen with the [retrievechat] option before using RAG agents.
 pip install "pyautogen[retrievechat]"
 ```
 
+RetrieveChat can handle various types of documents. By default, it can process
+plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
+'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
+If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
+(`pip install "unstructured[all-docs]"`), additional document types such as 'doc',
+'odt', 'png', 'epub', 'jpeg', 'pptx', 'xlsx', 'ppt', 'eml', 'docx' and 'msg' will
+also be supported.
+
+You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`.
+
 1. Import Agents
 ```python
 from autogen
@@ -474,3 +484,4 @@ The online app and the source code are hosted in [HuggingFace](https://huggingfa
 You can check out more example notebooks for RAG use cases:
 - [Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
 - [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
+- [Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)
diff --git a/website/docs/Installation.md b/website/docs/Installation.md
index 2cacceda2c08..9195840cc49b 100644
--- a/website/docs/Installation.md
+++ b/website/docs/Installation.md
@@ -50,7 +50,7 @@ conda install pyautogen -c conda-forge
 ``` -->
 
 ### Optional Dependencies
-* docker
+- #### docker
 
 For the best user experience and seamless code execution, we highly recommend using Docker with AutoGen. Docker is a containerization platform that simplifies the setup and execution of your code. Developing in a docker container, such as GitHub Codespace, also makes the development convenient.
 
@@ -59,7 +59,7 @@ When running AutoGen out of a docker container, to use docker for code execution
 pip install docker
 ```
 
-* blendsearch
+- #### blendsearch
 
 AutoGen offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Please install with the [blendsearch] option to use it.
 ```bash
@@ -67,21 +67,38 @@ pip install "pyautogen[blendsearch]"
 ```
 
 Example notebooks:
-[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb),
+
+[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb)
+
 [Optimize for Math](https://github.com/microsoft/autogen/blob/main/notebook/oai_chatgpt_gpt4.ipynb)
 
-* retrievechat
+- #### retrievechat
 
 AutoGen supports retrieval-augmented generation tasks such as question answering and code generation with RAG agents. Please install with the [retrievechat] option to use it.
 ```bash
 pip install "pyautogen[retrievechat]"
 ```
 
+RetrieveChat can handle various types of documents. By default, it can process
+plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
+'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
+If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
+(`pip install "unstructured[all-docs]"`), additional document types such as 'doc',
+'odt', 'png', 'epub', 'jpeg', 'pptx', 'xlsx', 'ppt', 'eml', 'docx' and 'msg' will
+also be supported.
+
+You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`.
+
 Example notebooks:
-[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb),
+
+[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
+
 [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
 
-* mathchat
+[Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)
+
+
+- #### mathchat
 
 AutoGen offers an experimental agent for math problem solving. Please install with the [mathchat] option to use it.
 ```bash
@@ -89,4 +106,5 @@ pip install "pyautogen[mathchat]"
 ```
 
 Example notebooks:
+
 [Using MathChat to Solve Math Problems](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_MathChat.ipynb)

From fa4523efee91148be6c380e674743a8ca2d42f89 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Wed, 1 Nov 2023 09:42:31 +0000
Subject: [PATCH 4/6] Fix tests

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7fe4cc4fa738..0acafd7cc0a5 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -40,7 +40,7 @@ jobs:
           python -m pip install --upgrade pip wheel
           pip install -e .
           python -c "import autogen"
-          pip install "unstructured[docx,pptx]"
+          pip install "unstructured[all-docs]"
           pip install -e.[mathchat,retrievechat,test] datasets pytest
           pip uninstall -y openai
       - name: Test with pytest

From 47b744f33c98656ce36c9c780bef2a403a60d026 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Wed, 1 Nov 2023 10:37:23 +0000
Subject: [PATCH 5/6] Fix tests

---
 autogen/retrieve_utils.py                     | 24 +------------------
 test/test_retrieve_utils.py                   |  5 +---
 .../blog/2023-10-18-RetrieveChat/index.mdx    |  5 ++--
 website/docs/Installation.md                  |  5 ++--
 4 files changed, 6 insertions(+), 33 deletions(-)

diff --git a/autogen/retrieve_utils.py b/autogen/retrieve_utils.py
index 217caa5949eb..b98ba862d1a0 100644
--- a/autogen/retrieve_utils.py
+++ b/autogen/retrieve_utils.py
@@ -40,29 +40,7 @@
     "yml",
     "pdf",
 ]
-UNSTRUCTURED_FORMATS = [
-    "eml",
-    "html",
-    "json",
-    "md",
-    "msg",
-    "rst",
-    "rtf",
-    "txt",
-    "xml",
-    "csv",
-    "doc",
-    "docx",
-    "epub",
-    "odt",
-    "pdf",
-    "ppt",
-    "pptx",
-    "tsv",
-    "xlsx",
-    "jpeg",
-    "png",
-]
+UNSTRUCTURED_FORMATS = ["docx", "doc", "odt", "pptx", "ppt", "xlsx", "eml", "msg", "epub"]
 if HAS_UNSTRUCTURED:
     TEXT_FORMATS += UNSTRUCTURED_FORMATS
     TEXT_FORMATS = list(set(TEXT_FORMATS))
diff --git a/test/test_retrieve_utils.py b/test/test_retrieve_utils.py
index b74ce75c4641..7338f731517f 100644
--- a/test/test_retrieve_utils.py
+++ b/test/test_retrieve_utils.py
@@ -14,6 +14,7 @@
 from autogen.token_count_utils import count_token
 
 import os
+import sys
 import pytest
 import chromadb
 
@@ -170,10 +171,6 @@ def custom_text_split_function(text):
             in results.get("documents")[0][0]
         )
 
-    @pytest.mark.skipif(
-        HAS_UNSTRUCTURED,
-        reason="do not run if unstructured is installed",
-    )
     def test_retrieve_utils(self):
         client = chromadb.PersistentClient(path="/tmp/chromadb")
         create_vector_db_from_dir(
diff --git a/website/blog/2023-10-18-RetrieveChat/index.mdx b/website/blog/2023-10-18-RetrieveChat/index.mdx
index f5c5ae56d18b..362e5a5cbf3a 100644
--- a/website/blog/2023-10-18-RetrieveChat/index.mdx
+++ b/website/blog/2023-10-18-RetrieveChat/index.mdx
@@ -58,9 +58,8 @@ RetrieveChat can handle various types of documents. By default, it can process
 plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
 If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
-(`pip install "unstructured[all-docs]"`), additional document types such as 'doc',
-'odt', 'png', 'epub', 'jpeg', 'pptx', 'xlsx', 'ppt', 'eml', 'docx' and 'msg' will
-also be supported.
+(`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
+'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg' and 'epub' will also be supported.
 
 You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`.
 
diff --git a/website/docs/Installation.md b/website/docs/Installation.md
index 9195840cc49b..e32f4b58d025 100644
--- a/website/docs/Installation.md
+++ b/website/docs/Installation.md
@@ -83,9 +83,8 @@ RetrieveChat can handle various types of documents. By default, it can process
 plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
 If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
-(`pip install "unstructured[all-docs]"`), additional document types such as 'doc',
-'odt', 'png', 'epub', 'jpeg', 'pptx', 'xlsx', 'ppt', 'eml', 'docx' and 'msg' will
-also be supported.
+(`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
+'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg' and 'epub' will also be supported.
 
 You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`.
 

From 9f060df3844c29820d1562d408f4630f9e932ef9 Mon Sep 17 00:00:00 2001
From: Li Jiang <bnujli@gmail.com>
Date: Wed, 1 Nov 2023 20:15:57 +0800
Subject: [PATCH 6/6] Test unstructured on linux and mac

---
 .github/workflows/build.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0acafd7cc0a5..a59879ff8a37 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -40,9 +40,12 @@ jobs:
           python -m pip install --upgrade pip wheel
           pip install -e .
           python -c "import autogen"
-          pip install "unstructured[all-docs]"
           pip install -e.[mathchat,retrievechat,test] datasets pytest
           pip uninstall -y openai
+      - name: Install unstructured if not windows
+        if: matrix.os != 'windows-2019'
+        run: |
+          pip install "unstructured[all-docs]"
       - name: Test with pytest
         if: matrix.python-version != '3.10'
         run: |