From 68cce1adbd20bc051af313d0f502d5c38582d48b Mon Sep 17 00:00:00 2001 From: Martin Fischer Date: Mon, 7 Aug 2023 19:20:58 +0200 Subject: [PATCH] fix(dict): Remove only corrections if a space could be inserted as well MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The typo dictionary words.csv previously contained a bunch of problematic entries such as: abouta,about algorithmi,algorithm attachen,attach shouldbe,should anumber,number Which resulted in wrong automatic corrections if the following spaces (indicated by ␣) were accidentally missed: about␣a algorithm␣i developed attach␣en masse should␣be a␣number Many of these entries were introduced by taking entries from the codespell-dict and removing corrections containing spaces (since typos currently doesn't support them), e.g. the codespell dictionary contains: abouta->about a, about, shouldbe->should, should be, This commit updates `tests/verify.rs` to automatically remove corrections in the form of `{correction}{common_word},{correction}` or `{common_word}{correction},{correction}`, where `{common_word}` is one of the 1000 most frequent English words (except if `{correction}` also ends/starts in `{common_word}`, since we still want to correct e.g. "extrememe" to "extreme"). 
The top-1000-most-frequent-words.csv file was generated by running: curl https://norvig.com/ngrams/count_1w.txt \ | head -n1024 \ | awk '{print $1;}' \ | grep -vE '^([^ia]|al|re)$' \ > top-1000-most-frequent-words.csv --- .../assets/top-1000-most-frequent-words.csv | 1000 +++++++++++++++++ crates/typos-dict/assets/words.csv | 152 +-- crates/typos-dict/src/dict_codegen.rs | 154 ++- crates/typos-dict/tests/verify.rs | 85 +- 4 files changed, 1229 insertions(+), 162 deletions(-) create mode 100644 crates/typos-dict/assets/top-1000-most-frequent-words.csv diff --git a/crates/typos-dict/assets/top-1000-most-frequent-words.csv b/crates/typos-dict/assets/top-1000-most-frequent-words.csv new file mode 100644 index 000000000..8de101f40 --- /dev/null +++ b/crates/typos-dict/assets/top-1000-most-frequent-words.csv @@ -0,0 +1,1000 @@ +the +of +and +to +a +in +for +is +on +that +by +this +with +i +you +it +not +or +be +are +from +at +as +your +all +have +new +more +an +was +we +will +home +can +us +about +if +page +my +has +search +free +but +our +one +other +do +no +information +time +they +site +he +up +may +what +which +their +news +out +use +any +there +see +only +so +his +when +contact +here +business +who +web +also +now +help +get +pm +view +online +first +am +been +would +how +were +me +services +some +these +click +its +like +service +than +find +price +date +back +top +people +had +list +name +just +over +state +year +day +into +email +two +health +world +next +used +go +work +last +most +products +music +buy +data +make +them +should +product +system +post +her +city +add +policy +number +such +please +available +copyright +support +message +after +best +software +then +jan +good +video +well +where +info +rights +public +books +high +school +through +each +links +she +review +years +order +very +privacy +book +items +company +read +group +sex +need +many +user +said +de +does +set +under +general +research +university +january +mail +full +map +reviews +program +life 
+know +games +way +days +management +part +could +great +united +hotel +real +item +international +center +ebay +must +store +travel +comments +made +development +report +off +member +details +line +terms +before +hotels +did +send +right +type +because +local +those +using +results +office +education +national +car +design +take +posted +internet +address +community +within +states +area +want +phone +dvd +shipping +reserved +subject +between +forum +family +long +based +code +show +even +black +check +special +prices +website +index +being +women +much +sign +file +link +open +today +technology +south +case +project +same +pages +uk +version +section +own +found +sports +house +related +security +both +county +american +photo +game +members +power +while +care +network +down +computer +systems +three +total +place +end +following +download +him +without +per +access +think +north +resources +current +posts +big +media +law +control +water +history +pictures +size +art +personal +since +including +guide +shop +directory +board +location +change +white +text +small +rating +rate +government +children +during +usa +return +students +shopping +account +times +sites +level +digital +profile +previous +form +events +love +old +john +main +call +hours +image +department +title +description +non +insurance +another +why +shall +property +class +cd +still +money +quality +every +listing +content +country +private +little +visit +save +tools +low +reply +customer +december +compare +movies +include +college +value +article +york +man +card +jobs +provide +food +source +author +different +press +learn +sale +around +print +course +job +canada +process +teen +room +stock +training +too +credit +point +join +science +men +categories +advanced +west +sales +look +english +left +team +estate +box +conditions +select +windows +photos +gay +thread +week +category +note +live +large +gallery +table +register +however +june +october +november +market +library +really +action +start 
+series +model +features +air +industry +plan +human +provided +tv +yes +required +second +hot +accessories +cost +movie +forums +march +la +september +better +say +questions +july +yahoo +going +medical +test +friend +come +dec +server +pc +study +application +cart +staff +articles +san +feedback +again +play +looking +issues +april +never +users +complete +street +topic +comment +financial +things +working +against +standard +tax +person +below +mobile +less +got +blog +party +payment +equipment +login +student +let +programs +offers +legal +above +recent +park +stores +side +act +problem +red +give +memory +performance +social +august +quote +language +story +sell +options +experience +rates +create +key +body +young +america +important +field +few +east +paper +single +ii +age +activities +club +example +girls +additional +password +latest +something +road +gift +question +changes +night +ca +hard +texas +oct +pay +four +poker +status +browse +issue +range +building +seller +court +february +always +result +audio +light +write +war +nov +offer +blue +groups +easy +given +files +event +release +analysis +request +fax +china +making +picture +needs +possible +might +professional +yet +month +major +star +areas +future +space +committee +hand +sun +cards +problems +london +washington +meeting +rss +become +interest +id +child +keep +enter +california +porn +share +similar +garden +schools +million +added +reference +companies +listed +baby +learning +energy +run +delivery +net +popular +term +film +stories +put +computers +journal +reports +co +try +welcome +central +images +president +notice +god +original +head +radio +until +cell +color +self +council +away +includes +track +australia +discussion +archive +once +others +entertainment +agreement +format +least +society +months +log +safety +friends +sure +faq +trade +edition +cars +messages +marketing +tell +further +updated +association +able +having +provides +david +fun +already +green +studies +close +common 
+drive +specific +several +gold +feb +living +sep +collection +called +short +arts +lot +ask +display +limited +powered +solutions +means +director +daily +beach +past +natural +whether +due +et +electronics +five +upon +period +planning +database +says +official +weather +mar +land +average +done +technical +window +france +pro +region +island +record +direct +microsoft +conference +environment +records +st +district +calendar +costs +style +url +front +statement +update +parts +aug +ever +downloads +early +miles +sound +resource +present +applications +either +ago +document +word +works +material +bill +apr +written +talk +federal +hosting +rules +final +adult +tickets +thing +centre +requirements +via +cheap +nude +kids +finance +true +minutes +else +mark +third +rock +gifts +europe +reading +topics +bad +individual +tips +plus +auto +cover +usually +edit +together +videos +percent +fast +function +fact +unit +getting +global +tech +meet +far +economic +en +player +projects +lyrics +often +subscribe +submit +germany +amount +watch +included +feel +though +bank +risk +thanks +everything +deals +various +words +linux +jul +production +commercial +james +weight +town +heart +advertising +received +choose +treatment +newsletter +archives +points +knowledge +magazine +error +camera +jun +girl +currently +construction +toys +registered +clear +golf +receive +domain +methods +chapter +makes +protection +policies +loan +wide +beauty +manager +india +position +taken +sort +listings +models +michael +known +half +cases +step +engineering +florida +simple +quick +none +wireless +license +paul +friday +lake +whole +annual +published +later +basic +sony +shows +corporate +google +church +method +purchase +customers +active +response +practice +hardware +figure +materials +fire +holiday +chat +enough +designed +along +among +death +writing +speed +html +countries +loss +face +brand +discount +higher +effects +created +remember +standards +oil +bit +yellow +political +increase 
+advertise +kingdom +base +near +environmental +thought +stuff +french +storage +oh +japan +doing +loans +shoes +entry +stay +nature +orders +availability +africa +summary +turn +mean +growth +notes +agency +king +monday +european +activity +copy +although +drug +pics +western +income +force +cash +employment +overall +bay diff --git a/crates/typos-dict/assets/words.csv b/crates/typos-dict/assets/words.csv index 8d8af1eeb..c2583c008 100644 --- a/crates/typos-dict/assets/words.csv +++ b/crates/typos-dict/assets/words.csv @@ -1,32 +1,32 @@ -aaccess,access -aaccessibility,accessibility -aaccession,accession +aaccess +aaccessibility +aaccession aache,ache,cache -aack,ack -aactual,actual -aactually,actually -aadd,add -aadded,added -aadding,adding -aagain,again -aaggregation,aggregation -aand,and -aanother,another -aapply,apply +aack +aactual +aactually +aadd +aadded +aadding +aagain +aaggregation +aand +aanother +aapply aaproximate,approximate aaproximated,approximated aaproximately,approximately aaproximates,approximates aaproximating,approximating -aare,are +aare aas,as,ass -aassign,assign -aassignment,assignment -aassignments,assignments -aassociated,associated -aassumed,assumed -aautomatic,automatic -aautomatically,automatically +aassign +aassignment +aassignments +aassociated +aassumed +aautomatic +aautomatically abailable,available abanden,abandon abandenment,abandonment @@ -139,7 +139,7 @@ abotu,about abou,about,abound abount,about abourt,abort,about -abouta,about +abouta abouve,above abov,above aboved,above @@ -1586,8 +1586,8 @@ advsiors,advisors adwances,advances aeactivate,deactivate,activate aeorspace,aerospace -aequidistant,equidistant -aequivalent,equivalent +aequidistant +aequivalent aeriel,aerial aeriels,aerials aeropsace,aerospace @@ -1675,7 +1675,7 @@ afhganistan,afghanistan afinity,affinity afircan,african afircans,africans -afor,for +afor aforememtioned,aforementioned aforementioend,aforementioned aforementiond,aforementioned @@ -2087,7 +2087,7 @@ 
algorithimical,algorithmic,algorithmically algorithimically,algorithmically algorithims,algorithm,algorithms algorithmes,algorithms -algorithmi,algorithm +algorithmi algorithmical,algorithmically algorithmm,algorithm algorithmmic,algorithmic @@ -2372,7 +2372,7 @@ allocal,allocate allocarion,allocation allocat,allocate allocatbale,allocatable -allocatedi,allocated +allocatedi allocatedp,allocated allocateing,allocating allocateng,allocating @@ -2827,7 +2827,7 @@ amrpits,armpits amrstrong,armstrong amtheyst,amethyst amublance,ambulance -amuch,much +amuch amung,among amunition,ammunition amunt,amount @@ -3407,7 +3407,7 @@ anulled,annulled anulling,annulling anulls,annulled anuls,annulls -anumber,number +anumber anurism,aneurism anuwhere,anywhere anway,anyway @@ -4599,7 +4599,7 @@ ascneding,ascending ascnesion,ascension ascpect,aspect ascpects,aspects -ascripts,scripts +ascripts asdignment,assignment asdignments,assignments asemble,assemble @@ -5102,7 +5102,7 @@ astronouts,astronauts astronuat,astronaut astronuats,astronauts astrounat,astronaut -asudo,sudo +asudo asume,assume asumed,assumed asumes,assumes @@ -5273,7 +5273,7 @@ attachement,attachment attachements,attachments attachemnt,attachment attachemnts,attachments -attachen,attach +attachen attachged,attached attachmant,attachment attachmants,attachments @@ -5397,7 +5397,7 @@ attriburted,attributed attriburtes,attributes attriburtion,attribution attribut,attribute -attributei,attribute +attributei attributen,attribute attributess,attributes attributo,attribution @@ -6520,7 +6520,7 @@ beautyfull,beautiful beautyfully,beautifully beavior,behavior beaviour,behaviour -bebefore,before +bebefore bebongs,belongs bebore,before becaause,because @@ -7373,7 +7373,7 @@ bounbdaries,boundaries bounbdary,boundary boundaires,boundaries boundares,boundaries -boundaryi,boundary +boundaryi boundarys,boundaries bounday,boundary boundays,boundaries @@ -7636,7 +7636,7 @@ broadcas,broadcast broadcase,broadcast broadcasing,broadcasting 
broadcastes,broadcasts -broadcasti,broadcast +broadcasti broadcastors,broadcasts broadcat,broadcasts,broadcast broadley,broadly @@ -8887,7 +8887,7 @@ catastrphic,catastrophic cataylst,catalyst catche,catch catched,caught -catchi,catch +catchi catchip,catchup catchs,catches categogical,categorical @@ -10425,7 +10425,7 @@ clickbat,clickbait clickear,clicker clien,client cliens,clients -clienta,client +clienta cliente,client,clientele clientelle,clientele clientes,clients @@ -11170,7 +11170,7 @@ commandent,commandment commandered,commanded commandes,commands commandeur,commanders -commandi,command +commandi commandmant,commandment commandmants,commandments commandmends,commandments @@ -12937,7 +12937,7 @@ connectinos,connections connectins,connects,connections connectiom,connection connectioms,connections -connectiona,connection +connectiona connectionas,connections connectiong,connecting connectit,connecticut @@ -13549,7 +13549,7 @@ contails,contains contaiminate,contaminate contaiminated,contaminated contaiminating,contaminating -containa,contain +containa containd,contained containe,contain,contained,container,contains containees,containers @@ -15066,7 +15066,7 @@ creamic,ceramic creasoat,creosote creastor,creator creatation,creation -createa,create +createa createable,creatable createdd,created createin,creatine @@ -15394,7 +15394,7 @@ cuestion,question cuestionable,questionable cuestioned,questioned cuestions,questions -cuileoga,cuileog +cuileoga culiminating,culminating culitvate,cultivate culprint,culprit @@ -17202,7 +17202,7 @@ dependenices,dependencies dependenies,dependencies dependening,depending dependensies,dependencies -dependenta,dependent +dependenta dependente,dependence dependeny,dependency dependet,dependent @@ -18363,7 +18363,7 @@ dictrionaries,dictionaries dictrionary,dictionary dicussed,discussed dicussions,discussions -didi,did +didi didsapointed,disappointed diea,idea,die diect,direct @@ -20188,7 +20188,7 @@ downgradde,downgrade 
downgradded,downgraded downgraddes,downgrades downgradding,downgrading -downgradei,downgrade +downgradei downgradingn,downgrading downgrate,downgrade downgrated,downgrade,downgraded @@ -21341,7 +21341,7 @@ enabeled,enabled enabeling,enabling enabels,enables enabing,enabling -enabledi,enabled +enabledi enableing,enabling enablen,enabled enahnces,enhances @@ -25970,7 +25970,7 @@ fragmet,fragment fragmnet,fragment frambuffer,framebuffer framebufer,framebuffer -framei,frame +framei frament,fragment framented,fragmented framents,fragments @@ -27781,7 +27781,7 @@ harvestgain,harvesting harware,hardware harwdare,hardware hases,hashes -hashi,hash +hashi hashs,hashes hashses,hashes hasing,hashing @@ -27798,7 +27798,7 @@ hauntig,haunting hauty,haughty hav,have,half hava,have -havea,have +havea havee,have haveing,having haversting,harvesting @@ -28904,11 +28904,11 @@ igzorting,exhorting igzorts,exhorts ihaca,ithaca ihs,his -iif,if -iimmune,immune -iinclude,include -iinterval,interval -iiterator,iterator +iif +iimmune +iinclude +iinterval +iiterator iland,island ileagle,illegal ilegal,illegal @@ -29026,7 +29026,7 @@ imaginaton,imagination imaginery,imaginary,imagery imaginitave,imaginative imaginitve,imaginative -imakes,makes +imakes imanent,eminent,imminent imapct,impact imapcted,impacted @@ -29769,7 +29769,7 @@ incliuding,including inclode,include inclreased,increased includ,include -includea,include +includea includeds,includes,included includee,include includeing,including @@ -30127,7 +30127,7 @@ indentifies,identifies indentifing,identifying indentify,identify indentifying,identifying -indentin,indent +indentin indentit,identity indentity,identity indentleveal,indentlevel @@ -31404,7 +31404,7 @@ installataions,installations installatation,installation installatin,installations installating,installation -installationa,installation +installationa installatons,installations installatron,installation installe,installer,installed,install @@ -32924,7 +32924,7 @@ 
isdefinitely,indefinitely iself,itself iselfe,itself iserting,inserting -isimilar,similar +isimilar islamisist,islamist islamisists,islamists islamiskt,islamist @@ -33040,8 +33040,8 @@ iterstions,iterations itertation,iteration iteself,itself itesm,items -itheir,their -itheirs,theirs +itheir +itheirs itialise,initialise itialised,initialised itialises,initialises @@ -33089,7 +33089,7 @@ iverse,diverse,inverse iversed,inverse,inversed ivocation,invocation ivoked,invoked -iwithout,without +iwithout iwll,will iwth,with jackonsville,jacksonville @@ -35859,7 +35859,7 @@ membrances,membrane membrans,membranes memcahe,memcache memcahed,memcached -memeasurement,measurement +memeasurement memeber,member memebered,remembered memebers,members @@ -35870,7 +35870,7 @@ memebrof,memberof memebrs,members memember,member,remember memembers,members,remembers -mememory,memory +mememory mememto,memento memeory,memory memer,member @@ -37293,7 +37293,7 @@ morbidy,morbidly morbildy,morbidly mordern,modern mordibly,morbidly -moreso,more +moreso morever,moreover morevoer,moreover morg,morgue @@ -43627,7 +43627,7 @@ positionned,positioned positionnes,positions positionning,positioning positionns,positions -positionof,position +positionof positiv,positive positiveity,positivity positivie,positive @@ -46649,7 +46649,7 @@ reactquire,reacquire readabilty,readability readanle,readable readble,readable -readby,read +readby readdrss,readdress readdrssed,readdressed readdrsses,readdresses @@ -52211,7 +52211,7 @@ shoudlnt,shouldnt shoudn,shouldn shoudt,should shoul,should,shawl,shoal -shouldbe,should +shouldbe shouldes,shoulders shouldnot,shouldnt shouldt,shouldnt @@ -52465,7 +52465,7 @@ simiilar,similar similair,similar similairty,similarity similaraties,similarities -similari,similar +similari similarily,similarly similarites,similarities similarlity,similarity @@ -60871,7 +60871,7 @@ upgradde,upgrade upgradded,upgraded upgraddes,upgrades upgradding,upgrading -upgradei,upgrade +upgradei 
upgradingn,upgrading upgrads,upgrades upgrate,upgrade @@ -61755,7 +61755,7 @@ vieports,viewports vietmanese,vietnamese vietnamees,vietnamese vietnameese,vietnamese -vietnamesea,vietnamese +vietnamesea vietnamesse,vietnamese vietnamiese,vietnamese vietnamnese,vietnamese @@ -61896,7 +61896,7 @@ visibiltiy,visibility visibilty,visibility visibily,visibility,visibly visibl,visible -visibleable,visible +visibleable visibles,visible visibley,visibly visiblities,visibilities @@ -62727,7 +62727,7 @@ witdhs,widths witdth,width witdths,widths wite,write,white -witha,with +witha withces,witches withdral,withdrawal withdrawalls,withdrawals @@ -63059,7 +63059,7 @@ xinitiazlize,xinitialize xmdoel,xmodel xode,code,xcode xour,your -xwindows,x +xwindows xyou,you yaching,yachting yaer,year diff --git a/crates/typos-dict/src/dict_codegen.rs b/crates/typos-dict/src/dict_codegen.rs index 31c764c15..5bae31ca4 100644 --- a/crates/typos-dict/src/dict_codegen.rs +++ b/crates/typos-dict/src/dict_codegen.rs @@ -655,7 +655,7 @@ pub static WORD_X_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dictge &["xmodel"], &["code", "xcode"], &["your"], - &["x"], + &[], &["you"], ], range: 3..=11, @@ -1924,7 +1924,7 @@ pub static WORD_WITH_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dic dictgen::InsensitiveStr::Ascii("uout"), ], values: &[ - &["with"], + &[], &["witches"], &["withdrawal"], &["withdrawals"], @@ -5062,7 +5062,7 @@ pub static WORD_VISI_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dic &["visibility"], &["visibility", "visibly"], &["visible"], - &["visible"], + &[], &["visible"], &["visibly"], &["visibilities"], @@ -5482,7 +5482,7 @@ pub static WORD_VIE_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict &["vietnamese"], &["vietnamese"], &["vietnamese"], - &["vietnamese"], + &[], &["vietnamese"], &["vietnamese"], &["vietnamese"], @@ -8513,7 +8513,7 @@ pub static WORD_UPG_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict &["upgraded"], 
&["upgrades"], &["upgrading"], - &["upgrade"], + &[], &["upgrading"], &["upgrades"], &["upgrade"], @@ -36550,7 +36550,7 @@ pub static WORD_SIMI_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dic &["similar"], &["similarity"], &["similarities"], - &["similar"], + &[], &["similarly"], &["similarities"], &["similarity"], @@ -37518,7 +37518,7 @@ pub static WORD_SHO_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict &["shouldn"], &["should"], &["should", "shawl", "shoal"], - &["should"], + &[], &["shoulders"], &["shouldnt"], &["shouldnt"], @@ -56280,7 +56280,7 @@ pub static WORD_READ_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dic &["readability"], &["readable"], &["readable"], - &["read"], + &[], &["readdress"], &["readdressed"], &["readdresses"], @@ -65978,7 +65978,7 @@ pub static WORD_POSI_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dic &["positions"], &["positioning"], &["positions"], - &["position"], + &[], &["positive"], &["positivity"], &["positive"], @@ -87580,7 +87580,7 @@ pub static WORD_MOR_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict &["morbidly"], &["modern"], &["morbidly"], - &["more"], + &[], &["moreover"], &["moreover"], &["morgue"], @@ -92051,7 +92051,7 @@ pub static WORD_MEM_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict &["membranes"], &["memcache"], &["memcached"], - &["measurement"], + &[], &["member"], &["remembered"], &["members"], @@ -92062,7 +92062,7 @@ pub static WORD_MEM_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict &["members"], &["member", "remember"], &["members", "remembers"], - &["memory"], + &[], &["memento"], &["memory"], &["member"], @@ -101218,7 +101218,7 @@ pub static WORD_IW_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dictg dictgen::InsensitiveStr::Ascii("ll"), dictgen::InsensitiveStr::Ascii("th"), ], - values: &[&["without"], &["will"], &["with"]], + values: &[&[], &["will"], &["with"]], range: 2..=6, }; @@ -101443,7 +101443,7 @@ pub 
static WORD_ITH_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict dictgen::InsensitiveStr::Ascii("eir"), dictgen::InsensitiveStr::Ascii("eirs"), ], - values: &[&["their"], &["theirs"]], + values: &[&[], &[]], range: 3..=4, }; @@ -101847,7 +101847,7 @@ static WORD_ISI_NODE: dictgen::DictTrieNode<&'static [&'static str]> = dictgen:: pub static WORD_ISI_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dictgen::DictTable { keys: &[dictgen::InsensitiveStr::Ascii("milar")], - values: &[&["similar"]], + values: &[&[]], range: 5..=5, }; @@ -106627,7 +106627,7 @@ pub static WORD_INSTAL_CHILDREN: dictgen::DictTable<&'static [&'static str]> = d &["installation"], &["installations"], &["installation"], - &["installation"], + &[], &["installations"], &["installation"], &["installer", "installed", "install"], @@ -110743,7 +110743,7 @@ pub static WORD_INDEN_CHILDREN: dictgen::DictTable<&'static [&'static str]> = di &["identifying"], &["identify"], &["identifying"], - &["indent"], + &[], &["identity"], &["identity"], &["indentlevel"], @@ -111975,7 +111975,7 @@ pub static WORD_INCL_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dic &["include"], &["increased"], &["include"], - &["include"], + &[], &["includes", "included"], &["include"], &["including"], @@ -114678,7 +114678,7 @@ pub static WORD_IMA_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict &["imaginary", "imagery"], &["imaginative"], &["imaginative"], - &["makes"], + &[], &["eminent", "imminent"], &["impact"], &["impacted"], @@ -115108,13 +115108,7 @@ pub static WORD_II_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dictg dictgen::InsensitiveStr::Ascii("nterval"), dictgen::InsensitiveStr::Ascii("terator"), ], - values: &[ - &["if"], - &["immune"], - &["include"], - &["interval"], - &["iterator"], - ], + values: &[&[], &[], &[], &[], &[]], range: 1..=7, }; @@ -119045,7 +119039,7 @@ pub static WORD_HAV_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict ], values: &[ 
&["have"], - &["have"], + &[], &["have"], &["having"], &["harvesting"], @@ -119113,7 +119107,7 @@ pub static WORD_HAS_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict ], values: &[ &["hashes"], - &["hash"], + &[], &["hashes"], &["hashes"], &["hashing"], @@ -125389,7 +125383,7 @@ pub static WORD_FRAM_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dic values: &[ &["framebuffer"], &["framebuffer"], - &["frame"], + &[], &["fragment"], &["fragmented"], &["fragments"], @@ -140078,7 +140072,7 @@ pub static WORD_ENA_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict &["enabling"], &["enables"], &["enabling"], - &["enabled"], + &[], &["enabling"], &["enabled"], &["enhances"], @@ -144235,7 +144229,7 @@ pub static WORD_DOWNG_CHILDREN: dictgen::DictTable<&'static [&'static str]> = di &["downgraded"], &["downgrades"], &["downgrading"], - &["downgrade"], + &[], &["downgrading"], &["downgrade"], &["downgrade", "downgraded"], @@ -150402,7 +150396,7 @@ pub static WORD_DID_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict dictgen::InsensitiveStr::Ascii("i"), dictgen::InsensitiveStr::Ascii("sapointed"), ], - values: &[&["did"], &["disappointed"]], + values: &[&[], &["disappointed"]], range: 1..=9, }; @@ -154659,7 +154653,7 @@ pub static WORD_DEPEND_CHILDREN: dictgen::DictTable<&'static [&'static str]> = d &["dependencies"], &["depending"], &["dependencies"], - &["dependent"], + &[], &["dependence"], &["dependency"], &["dependent"], @@ -160592,7 +160586,7 @@ static WORD_CUI_NODE: dictgen::DictTrieNode<&'static [&'static str]> = dictgen:: pub static WORD_CUI_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dictgen::DictTable { keys: &[dictgen::InsensitiveStr::Ascii("leoga")], - values: &[&["cuileog"]], + values: &[&[]], range: 5..=5, }; @@ -161727,7 +161721,7 @@ pub static WORD_CREA_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dic &["creosote"], &["creator"], &["creation"], - &["create"], + &[], &["creatable"], &["created"], 
&["creatine"], @@ -167174,7 +167168,7 @@ pub static WORD_CONTAI_CHILDREN: dictgen::DictTable<&'static [&'static str]> = d &["contaminate"], &["contaminated"], &["contaminating"], - &["contain"], + &[], &["contained"], &["contain", "contained", "container", "contains"], &["containers"], @@ -169305,7 +169299,7 @@ pub static WORD_CONNE_CHILDREN: dictgen::DictTable<&'static [&'static str]> = di &["connects", "connections"], &["connection"], &["connections"], - &["connection"], + &[], &["connections"], &["connecting"], &["connecticut"], @@ -174846,7 +174840,7 @@ pub static WORD_COMMA_CHILDREN: dictgen::DictTable<&'static [&'static str]> = di &["commanded"], &["commands"], &["commanders"], - &["command"], + &[], &["commandment"], &["commandments"], &["commandments"], @@ -177059,7 +177053,7 @@ pub static WORD_CLI_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict &["clicker"], &["client"], &["clients"], - &["client"], + &[], &["client", "clientele"], &["clientele"], &["clients"], @@ -182532,13 +182526,7 @@ pub static WORD_CATC_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dic dictgen::InsensitiveStr::Ascii("hip"), dictgen::InsensitiveStr::Ascii("hs"), ], - values: &[ - &["catch"], - &["caught"], - &["catch"], - &["catchup"], - &["catches"], - ], + values: &[&["catch"], &["caught"], &[], &["catchup"], &["catches"]], range: 2..=3, }; @@ -186510,7 +186498,7 @@ pub static WORD_BROA_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dic &["broadcast"], &["broadcasting"], &["broadcasts"], - &["broadcast"], + &[], &["broadcasts"], &["broadcasts", "broadcast"], &["broadly"], @@ -187504,7 +187492,7 @@ pub static WORD_BOUND_CHILDREN: dictgen::DictTable<&'static [&'static str]> = di values: &[ &["boundaries"], &["boundaries"], - &["boundary"], + &[], &["boundaries"], &["boundary"], &["boundaries"], @@ -190358,7 +190346,7 @@ pub static WORD_BEB_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict dictgen::InsensitiveStr::Ascii("ongs"), 
dictgen::InsensitiveStr::Ascii("ore"), ], - values: &[&["before"], &["belongs"], &["before"]], + values: &[&[], &["belongs"], &["before"]], range: 3..=5, }; @@ -194551,7 +194539,7 @@ pub static WORD_ATTRI_CHILDREN: dictgen::DictTable<&'static [&'static str]> = di &["attributes"], &["attribution"], &["attribute"], - &["attribute"], + &[], &["attribute"], &["attributes"], &["attribution"], @@ -194869,7 +194857,7 @@ pub static WORD_ATTA_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dic &["attachments"], &["attachment"], &["attachments"], - &["attach"], + &[], &["attached"], &["attachment"], &["attachments"], @@ -195398,7 +195386,7 @@ pub static WORD_ASU_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict dictgen::InsensitiveStr::Ascii("trian"), ], values: &[ - &["sudo"], + &[], &["assume"], &["assumed"], &["assumes"], @@ -196974,7 +196962,7 @@ pub static WORD_ASC_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict &["ascension"], &["aspect"], &["aspects"], - &["scripts"], + &[], ], range: 4..=9, }; @@ -200714,7 +200702,7 @@ pub static WORD_ANU_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict &["annulling"], &["annulled"], &["annulls"], - &["number"], + &[], &["aneurism"], &["anywhere"], ], @@ -202672,7 +202660,7 @@ pub static WORD_AMU_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict ], values: &[ &["ambulance"], - &["much"], + &[], &["among"], &["ammunition"], &["amount"], @@ -204206,7 +204194,7 @@ pub static WORD_ALLO_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dic &["allocation"], &["allocate"], &["allocatable"], - &["allocated"], + &[], &["allocated"], &["allocating"], &["allocating"], @@ -205118,7 +205106,7 @@ pub static WORD_ALGORI_CHILDREN: dictgen::DictTable<&'static [&'static str]> = d &["algorithmically"], &["algorithm", "algorithms"], &["algorithms"], - &["algorithm"], + &[], &["algorithmically"], &["algorithm"], &["algorithmic"], @@ -206374,7 +206362,7 @@ pub static WORD_AFO_CHILDREN: 
dictgen::DictTable<&'static [&'static str]> = dict dictgen::InsensitiveStr::Ascii("rmentioned"), ], values: &[ - &["for"], + &[], &["aforementioned"], &["aforementioned"], &["aforementioned"], @@ -206618,8 +206606,8 @@ pub static WORD_AE_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dictg values: &[ &["deactivate", "activate"], &["aerospace"], - &["equidistant"], - &["equivalent"], + &[], + &[], &["aerial"], &["aerials"], &["aerospace"], @@ -211377,7 +211365,7 @@ pub static WORD_ABO_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dict &["about", "abound"], &["about"], &["abort", "about"], - &["about"], + &[], &["above"], &["above"], &["above"], @@ -211640,35 +211628,35 @@ pub static WORD_AA_CHILDREN: dictgen::DictTable<&'static [&'static str]> = dictg dictgen::InsensitiveStr::Ascii("utomatically"), ], values: &[ - &["access"], - &["accessibility"], - &["accession"], + &[], + &[], + &[], &["ache", "cache"], - &["ack"], - &["actual"], - &["actually"], - &["add"], - &["added"], - &["adding"], - &["again"], - &["aggregation"], - &["and"], - &["another"], - &["apply"], + &[], + &[], + &[], + &[], + &[], + &[], + &[], + &[], + &[], + &[], + &[], &["approximate"], &["approximated"], &["approximately"], &["approximates"], &["approximating"], - &["are"], + &[], &["as", "ass"], - &["assign"], - &["assignment"], - &["assignments"], - &["associated"], - &["assumed"], - &["automatic"], - &["automatically"], + &[], + &[], + &[], + &[], + &[], + &[], + &[], ], range: 1..=12, }; diff --git a/crates/typos-dict/tests/verify.rs b/crates/typos-dict/tests/verify.rs index 560b095eb..0a1aa8e74 100644 --- a/crates/typos-dict/tests/verify.rs +++ b/crates/typos-dict/tests/verify.rs @@ -90,6 +90,7 @@ fn process>( let varcon_words = varcon_words(); let allowed_words = allowed_words(); let word_variants = proper_word_variants(); + let top_1000_most_frequent_english_words = top_1000_most_frequent_english_words(); let rows: Vec<_> = rows .into_iter() .filter(|(typo, _)| { @@ 
-108,15 +109,27 @@ fn process>( } }) .map(|(typo, corrections)| { - let mut new_corrections = IndexSet::new(); + let mut new_corrections = Vec::new(); for correction in corrections { let correction = word_variants .get(correction.as_str()) .and_then(|words| find_best_match(&typo, correction.as_str(), words)) .unwrap_or(&correction); - new_corrections.insert(correction.to_owned()); + new_corrections.push(correction.to_owned()); } - (typo, new_corrections) + if let [only_correction] = &new_corrections[..] { + if correction_should_contain_space( + &typo, + only_correction, + &top_1000_most_frequent_english_words, + ) { + // We cannot provide corrections since we don't yet support corrections + // containing spaces (see https://github.com/crate-ci/typos/issues/795), + // so we just clear the corrections (which still disallows the typo). + new_corrections.clear(); + } + } + (typo, new_corrections.into_iter().collect::>()) }) .collect(); let mut dict = Dict::new(); @@ -187,6 +200,29 @@ fn test_varcon_best_match() { ); } +#[test] +fn test_remove_only_correction_that_should_contain_space() { + assert_eq!( + process([("includea", ["include"],)]), + dict_from_iter([("includea", [],)]) + ); + + assert_eq!( + process([("ainclude", ["include"],)]), + dict_from_iter([("ainclude", [],)]) + ); + + assert_eq!( + process([("extrememe", ["extreme"],)]), + dict_from_iter([("extrememe", ["extreme"],)]) + ); + + assert_eq!( + process([("mememory", ["memory"],)]), + dict_from_iter([("mememory", ["memory"],)]) + ); +} + fn is_word(word: &str) -> bool { word.chars().all(|c| c.is_alphabetic()) } @@ -253,3 +289,46 @@ fn allowed_words() -> std::collections::HashMap { }) .collect() } + +fn top_1000_most_frequent_english_words() -> HashSet { + std::fs::read_to_string("assets/top-1000-most-frequent-words.csv") + .unwrap() + .lines() + .map(str::to_string) + .collect() +} + +/// Returns true if the typo equals the only correction immediately preceded or +/// followed by a common English word. 
(Unless the only correction also +/// starts/ends in the common English word.) +/// +/// The reasoning behind this is that we don't want to correct e.g. `includea` +/// to `include` since it might just be missing a space (`include a`). +/// On the other hand we still want to correct e.g. `extrememe` to `extreme`. +fn correction_should_contain_space( + typo: &str, + only_correction: &str, + most_frequent_english_words: &HashSet, +) -> bool { + if let Some(prefix) = typo.strip_suffix(only_correction) { + if most_frequent_english_words.contains(prefix) { + if only_correction.starts_with(prefix) { + return false; + } + + return true; + } + } + + if let Some(suffix) = typo.strip_prefix(only_correction) { + if most_frequent_english_words.contains(suffix) { + if only_correction.ends_with(suffix) { + return false; + } + + return true; + } + } + + false +}