diff --git a/parts/meta/kw_uri_map.txt b/parts/meta/kw_uri_map.txt index 2062cf32..a1d572c8 100644 --- a/parts/meta/kw_uri_map.txt +++ b/parts/meta/kw_uri_map.txt @@ -201,8 +201,20 @@ ENDNUM __RefHeading___Toc123125_83452205 ENDSCALE __RefHeading___Toc68146_2267116897 ENDSKIP __RefHeading___Toc605472_3199477706 ENKRVD __RefHeading___Toc69787_621662414 +ENKRVDX __RefHeading___Toc69787_621662414 +ENKRVDX- __RefHeading___Toc69787_621662414 +ENKRVDY __RefHeading___Toc69787_621662414 +ENKRVDY- __RefHeading___Toc69787_621662414 +ENKRVDZ __RefHeading___Toc69787_621662414 +ENKRVDZ- __RefHeading___Toc69787_621662414 ENPCVD __RefHeading___Toc217353_803326780 ENPTVD __RefHeading___Toc69789_621662414 +ENPTVDX __RefHeading___Toc69789_621662414 +ENPTVDX- __RefHeading___Toc69789_621662414 +ENPTVDY __RefHeading___Toc69789_621662414 +ENPTVDY- __RefHeading___Toc69789_621662414 +ENPTVDZ __RefHeading___Toc69789_621662414 +ENPTVDZ- __RefHeading___Toc69789_621662414 ENSPCVD __RefHeading___Toc227745_803326780 EOS REF_HEADING_KEYWORD_EOS_8_3 EOSNUM REF_HEADING_KEYWORD_EOSNUM_9_3 @@ -397,19 +409,79 @@ HZFIN __RefHeading___Toc227697_2135714711 IHOST __RefHeading___Toc512366_2135714711 IKRG __RefHeading___Toc506847_2135714711 IKRGR __RefHeading___Toc70187_3358172231 +IKRGRX __RefHeading___Toc70187_3358172231 +IKRGRX- __RefHeading___Toc70187_3358172231 +IKRGRY __RefHeading___Toc70187_3358172231 +IKRGRY- __RefHeading___Toc70187_3358172231 +IKRGRZ __RefHeading___Toc70187_3358172231 +IKRGRZ- __RefHeading___Toc70187_3358172231 +IKRGX __RefHeading___Toc506847_2135714711 +IKRGX- __RefHeading___Toc506847_2135714711 +IKRGY __RefHeading___Toc506847_2135714711 +IKRGY- __RefHeading___Toc506847_2135714711 +IKRGZ __RefHeading___Toc506847_2135714711 +IKRGZ- __RefHeading___Toc506847_2135714711 IKRO __RefHeading___Toc97395_6216624141 IKRORG __RefHeading___Toc70189_3358172231 +IKRORGX __RefHeading___Toc70189_3358172231 +IKRORGX- __RefHeading___Toc70189_3358172231 +IKRORGY __RefHeading___Toc70189_3358172231 +IKRORGY- __RefHeading___Toc70189_3358172231 +IKRORGZ __RefHeading___Toc70189_3358172231 +IKRORGZ- __RefHeading___Toc70189_3358172231 IKRORW __RefHeading___Toc70191_3358172231 +IKRORWX __RefHeading___Toc70191_3358172231 +IKRORWX- __RefHeading___Toc70191_3358172231 +IKRORWY __RefHeading___Toc70191_3358172231 +IKRORWY- __RefHeading___Toc70191_3358172231 +IKRORWZ __RefHeading___Toc70191_3358172231 +IKRORWZ- __RefHeading___Toc70191_3358172231 +IKROX __RefHeading___Toc97395_6216624141 +IKROX- __RefHeading___Toc97395_6216624141 +IKROY __RefHeading___Toc97395_6216624141 +IKROY- __RefHeading___Toc97395_6216624141 +IKROZ __RefHeading___Toc97395_6216624141 +IKROZ- __RefHeading___Toc97395_6216624141 IKRW __RefHeading___Toc97397_6216624141 IKRWR __RefHeading___Toc70193_3358172231 +IKRWRX __RefHeading___Toc70193_3358172231 +IKRWRX- __RefHeading___Toc70193_3358172231 +IKRWRY __RefHeading___Toc70193_3358172231 +IKRWRY- __RefHeading___Toc70193_3358172231 +IKRWRZ __RefHeading___Toc70193_3358172231 +IKRWRZ- __RefHeading___Toc70193_3358172231 +IKRWX __RefHeading___Toc97397_6216624141 +IKRWX- __RefHeading___Toc97397_6216624141 +IKRWY __RefHeading___Toc97397_6216624141 +IKRWY- __RefHeading___Toc97397_6216624141 +IKRWZ __RefHeading___Toc97397_6216624141 +IKRWZ- __RefHeading___Toc97397_6216624141 IMBNUM __RefHeading___Toc129665_83452205 IMBNUMMF __RefHeading___Toc523278_2135714711 +IMBNUMX __RefHeading___Toc129665_83452205 +IMBNUMX- __RefHeading___Toc129665_83452205 +IMBNUMY __RefHeading___Toc129665_83452205 +IMBNUMY- __RefHeading___Toc129665_83452205 +IMBNUMZ __RefHeading___Toc129665_83452205 +IMBNUMZ- __RefHeading___Toc129665_83452205 IMKRVD __RefHeading___Toc106393_3115110868 +IMKRVDX __RefHeading___Toc106393_3115110868 +IMKRVDX- __RefHeading___Toc106393_3115110868 +IMKRVDY __RefHeading___Toc106393_3115110868 +IMKRVDY- __RefHeading___Toc106393_3115110868 +IMKRVDZ __RefHeading___Toc106393_3115110868 +IMKRVDZ- __RefHeading___Toc106393_3115110868 IMPCVD __RefHeading___Toc528742_2135714711 IMPES __RefHeading___Toc53039_4106839650 IMPLICIT __RefHeading___Toc534216_2135714711 IMPORT __RefHeading___Toc561662_2135714711 IMPTVD __RefHeading___Toc70967_621662414 +IMPTVDX __RefHeading___Toc70967_621662414 +IMPTVDX- __RefHeading___Toc70967_621662414 +IMPTVDY __RefHeading___Toc70967_621662414 +IMPTVDY- __RefHeading___Toc70967_621662414 +IMPTVDZ __RefHeading___Toc70967_621662414 +IMPTVDZ- __RefHeading___Toc70967_621662414 IMSPCVD __RefHeading___Toc545192_2135714711 INCLUDE __RefHeading___Toc605478_3199477706 INIT __RefHeading___Toc45789_719036256 @@ -422,27 +494,127 @@ IONXSURF __RefHeading___Toc229181_2369005893 IPCG __RefHeading___Toc77038_621662414 IPCW __RefHeading___Toc84166_621662414 ISGCR __RefHeading___Toc64693_2379415017 +ISGCRX __RefHeading___Toc64693_2379415017 +ISGCRX- __RefHeading___Toc64693_2379415017 +ISGCRY __RefHeading___Toc64693_2379415017 +ISGCRY- __RefHeading___Toc64693_2379415017 +ISGCRZ __RefHeading___Toc64693_2379415017 +ISGCRZ- __RefHeading___Toc64693_2379415017 ISGL __RefHeading___Toc22881_7842323222 ISGLPC __RefHeading___Toc170134_1371377330 +ISGLX __RefHeading___Toc22881_7842323222 +ISGLX- __RefHeading___Toc22881_7842323222 +ISGLY __RefHeading___Toc22881_7842323222 +ISGLY- __RefHeading___Toc22881_7842323222 +ISGLZ __RefHeading___Toc22881_7842323222 +ISGLZ- __RefHeading___Toc22881_7842323222 ISGU __RefHeading___Toc22883_7842323222 +ISGUX __RefHeading___Toc22883_7842323222 +ISGUX- __RefHeading___Toc22883_7842323222 +ISGUY __RefHeading___Toc22883_7842323222 +ISGUY- __RefHeading___Toc22883_7842323222 +ISGUZ __RefHeading___Toc22883_7842323222 +ISGUZ- __RefHeading___Toc22883_7842323222 ISOGCR __RefHeading___Toc30434_7842323221 +ISOGCRX __RefHeading___Toc30434_7842323221 +ISOGCRX- __RefHeading___Toc30434_7842323221 +ISOGCRY __RefHeading___Toc30434_7842323221 +ISOGCRY- __RefHeading___Toc30434_7842323221 +ISOGCRZ __RefHeading___Toc30434_7842323221 +ISOGCRZ- __RefHeading___Toc30434_7842323221 ISOLNUM __RefHeading___Toc69656_3218818441 ISOWCR __RefHeading___Toc30436_7842323221 +ISOWCRX __RefHeading___Toc30436_7842323221 +ISOWCRX- __RefHeading___Toc30436_7842323221 +ISOWCRY __RefHeading___Toc30436_7842323221 +ISOWCRY- __RefHeading___Toc30436_7842323221 +ISOWCRZ __RefHeading___Toc30436_7842323221 +ISOWCRZ- __RefHeading___Toc30436_7842323221 ISWCR __RefHeading___Toc27248_7842323221 +ISWCRX __RefHeading___Toc27248_7842323221 +ISWCRX- __RefHeading___Toc27248_7842323221 +ISWCRY __RefHeading___Toc27248_7842323221 +ISWCRY- __RefHeading___Toc27248_7842323221 +ISWCRZ __RefHeading___Toc27248_7842323221 +ISWCRZ- __RefHeading___Toc27248_7842323221 ISWL __RefHeading___Toc22881_78423232211 ISWLPC __RefHeading___Toc176392_1371377330 +ISWLX __RefHeading___Toc22881_78423232211 +ISWLX- __RefHeading___Toc22881_78423232211 +ISWLY __RefHeading___Toc22881_78423232211 +ISWLY- __RefHeading___Toc22881_78423232211 +ISWLZ __RefHeading___Toc22881_78423232211 +ISWLZ- __RefHeading___Toc22881_78423232211 ISWU __RefHeading___Toc22883_78423232211 +ISWUX __RefHeading___Toc22883_78423232211 +ISWUX- __RefHeading___Toc22883_78423232211 +ISWUY __RefHeading___Toc22883_78423232211 +ISWUY- __RefHeading___Toc22883_78423232211 +ISWUZ __RefHeading___Toc22883_78423232211 +ISWUZ- __RefHeading___Toc22883_78423232211 JFUNC __RefHeading___Toc86297_3218818441 JFUNCR __RefHeading___Toc257142_2369005893 KRG __RefHeading___Toc97393_621662414 KRGR __RefHeading___Toc70187_335817223 +KRGRX __RefHeading___Toc70187_335817223 +KRGRX- __RefHeading___Toc70187_335817223 +KRGRY __RefHeading___Toc70187_335817223 +KRGRY- __RefHeading___Toc70187_335817223 +KRGRZ __RefHeading___Toc70187_335817223 +KRGRZ- __RefHeading___Toc70187_335817223 +KRGX __RefHeading___Toc97393_621662414 +KRGX- __RefHeading___Toc97393_621662414 +KRGY __RefHeading___Toc97393_621662414 +KRGY- __RefHeading___Toc97393_621662414 +KRGZ __RefHeading___Toc97393_621662414 +KRGZ- __RefHeading___Toc97393_621662414 KRNUM __RefHeading___Toc273792_2369005893 KRNUMMF __RefHeading___Toc279351_2369005893 +KRNUMR __RefHeading___Toc273792_2369005893 +KRNUMR- __RefHeading___Toc273792_2369005893 +KRNUMT __RefHeading___Toc273792_2369005893 +KRNUMT- __RefHeading___Toc273792_2369005893 +KRNUMX __RefHeading___Toc273792_2369005893 +KRNUMX- __RefHeading___Toc273792_2369005893 +KRNUMY __RefHeading___Toc273792_2369005893 +KRNUMY- __RefHeading___Toc273792_2369005893 +KRNUMZ __RefHeading___Toc273792_2369005893 +KRNUMZ- __RefHeading___Toc273792_2369005893 KRO __RefHeading___Toc97395_621662414 KRORG __RefHeading___Toc70189_335817223 +KRORGX __RefHeading___Toc70189_335817223 +KRORGX- __RefHeading___Toc70189_335817223 +KRORGY __RefHeading___Toc70189_335817223 +KRORGY- __RefHeading___Toc70189_335817223 +KRORGZ __RefHeading___Toc70189_335817223 +KRORGZ- __RefHeading___Toc70189_335817223 KRORW __RefHeading___Toc70191_335817223 +KRORWX __RefHeading___Toc70191_335817223 +KRORWX- __RefHeading___Toc70191_335817223 +KRORWY __RefHeading___Toc70191_335817223 +KRORWY- __RefHeading___Toc70191_335817223 +KRORWZ __RefHeading___Toc70191_335817223 +KRORWZ- __RefHeading___Toc70191_335817223 +KROX __RefHeading___Toc97395_621662414 +KROX- __RefHeading___Toc97395_621662414 +KROY __RefHeading___Toc97395_621662414 +KROY- __RefHeading___Toc97395_621662414 +KROZ __RefHeading___Toc97395_621662414 +KROZ- __RefHeading___Toc97395_621662414 KRW __RefHeading___Toc97397_621662414 KRWR __RefHeading___Toc70193_335817223 +KRWRX __RefHeading___Toc70193_335817223 +KRWRX- __RefHeading___Toc70193_335817223 +KRWRY __RefHeading___Toc70193_335817223 +KRWRY- __RefHeading___Toc70193_335817223 +KRWRZ __RefHeading___Toc70193_335817223 +KRWRZ- __RefHeading___Toc70193_335817223 +KRWX __RefHeading___Toc97397_621662414 +KRWX- __RefHeading___Toc97397_621662414 +KRWY __RefHeading___Toc97397_621662414 +KRWY- __RefHeading___Toc97397_621662414 +KRWZ __RefHeading___Toc97397_621662414 +KRWZ- __RefHeading___Toc97397_621662414 LAB __RefHeading___Toc72458_2267116897 LANGMPL __RefHeading___Toc208762_2843394514 LANGMUIR __RefHeading___Toc214338_2843394514 @@ -528,7 +700,7 @@ MSFN __RefHeading___Toc109745_335817223 MSGFILE __RefHeading___Toc66023_4106839650 MULSGGD __RefHeading___Toc598412_3181922006 MULSGGDV __RefHeading___Toc598414_3181922006 -MULT __RefHeading___Toc195177_1371377330 +MULTIN __RefHeading___Toc195177_1371377330 MULTFLT __RefHeading___Toc99835_2023071304 MULTIPLY __RefHeading___Toc99909_2023071304 MULTIREG __RefHeading___Toc99911_2023071304 @@ -694,7 +866,7 @@ PVCO __RefHeading___Toc325700_501926209 PVDG __RefHeading___Toc104056_57619843 PVDO __RefHeading___Toc45803_719036256 PVDS __RefHeading___Toc104058_57619843 -PVSOL __RefHeading___Toc414279_1093985484 +PVTSOL __RefHeading___Toc414279_1093985484 PVTG __RefHeading___Toc104060_57619843 PVTGW __RefHeading___Toc355649_3149455253 PVTGWO __RefHeading___Toc356776_4176551521 @@ -810,14 +982,32 @@ SEPVALS __RefHeading___Toc663272_516898843 SFOAM __RefHeading___Toc669649_516898843 SGAS __RefHeading___Toc137369_1317547213 SGCR __RefHeading___Toc20428_784232322 +SGCRX __RefHeading___Toc20428_784232322 +SGCRX- __RefHeading___Toc20428_784232322 +SGCRY __RefHeading___Toc20428_784232322 +SGCRY- __RefHeading___Toc20428_784232322 +SGCRZ __RefHeading___Toc20428_784232322 +SGCRZ- __RefHeading___Toc20428_784232322 SGCWMIS __RefHeading___Toc121473_83452205 SGF32D __RefHeading___Toc676035_516898843 SGFN __RefHeading___Toc106868_335817223 SGL __RefHeading___Toc22881_784232322 SGLPC __RefHeading___Toc170136_1371377330 +SGLX __RefHeading___Toc22881_784232322 +SGLX- __RefHeading___Toc22881_784232322 +SGLY __RefHeading___Toc22881_784232322 +SGLY- __RefHeading___Toc22881_784232322 +SGLZ __RefHeading___Toc22881_784232322 +SGLZ- __RefHeading___Toc22881_784232322 SGOF __RefHeading___Toc106870_335817223 SGOFLET __RefHeading___Toc398952_3017686537 SGU __RefHeading___Toc22883_784232322 +SGUX __RefHeading___Toc22883_784232322 +SGUX- __RefHeading___Toc22883_784232322 +SGUY __RefHeading___Toc22883_784232322 +SGUY- __RefHeading___Toc22883_784232322 +SGUZ __RefHeading___Toc22883_784232322 +SGUZ- __RefHeading___Toc22883_784232322 SGWFLET __RefHeading___Toc548127_947687768 SGWFN __RefHeading___Toc106872_335817223 SHRATE __RefHeading___Toc121475_83452205 @@ -849,6 +1039,12 @@ SOF2 __RefHeading___Toc106876_335817223 SOF3 __RefHeading___Toc106878_335817223 SOF32D __RefHeading___Toc765497_4250154414 SOGCR __RefHeading___Toc30434_784232322 +SOGCRX __RefHeading___Toc30434_784232322 +SOGCRX- __RefHeading___Toc30434_784232322 +SOGCRY __RefHeading___Toc30434_784232322 +SOGCRY- __RefHeading___Toc30434_784232322 +SOGCRZ __RefHeading___Toc30434_784232322 +SOGCRZ- __RefHeading___Toc30434_784232322 SOIL __RefHeading___Toc137371_1317547213 SOLUTION __RefHeading___Toc43947_784232322 SOLVCONC __RefHeading___Toc771984_4250154414 @@ -862,6 +1058,12 @@ SOMWAT __RefHeading___Toc798238_4250154414 SORWMIS __RefHeading___Toc121477_83452205 SOURCE REF_HEADING_KEYWORD_SOURCE_12_3 SOWCR __RefHeading___Toc30436_784232322 +SOWCRX __RefHeading___Toc30436_784232322 +SOWCRX- __RefHeading___Toc30436_784232322 +SOWCRY __RefHeading___Toc30436_784232322 +SOWCRY- __RefHeading___Toc30436_784232322 +SOWCRZ __RefHeading___Toc30436_784232322 +SOWCRZ- __RefHeading___Toc30436_784232322 SOXYG __RefHeading___Toc439460_111689907 SPECGRID- __RefHeading___Toc45797_7190362561 SPECHEAT __RefHeading___Toc121479_83452205 @@ -905,14 +1107,32 @@ SURFWNUM __RefHeading___Toc1179776_4250154414 SWAT __RefHeading___Toc137373_1317547213 SWATINIT __RefHeading___Toc323952_1728001293 SWCR __RefHeading___Toc27248_784232322 +SWCRX __RefHeading___Toc27248_784232322 +SWCRX- __RefHeading___Toc27248_784232322 +SWCRY __RefHeading___Toc27248_784232322 +SWCRY- __RefHeading___Toc27248_784232322 +SWCRZ __RefHeading___Toc27248_784232322 +SWCRZ- __RefHeading___Toc27248_784232322 SWF32D __RefHeading___Toc1186403_4250154414 SWFN __RefHeading___Toc106882_335817223 SWINGFAC __RefHeading___Toc1193036_4250154414 SWL __RefHeading___Toc22881_7842323221 SWLPC __RefHeading___Toc179514_1371377330 +SWLX __RefHeading___Toc22881_7842323221 +SWLX- __RefHeading___Toc22881_7842323221 +SWLY __RefHeading___Toc22881_7842323221 +SWLY- __RefHeading___Toc22881_7842323221 +SWLZ __RefHeading___Toc22881_7842323221 +SWLZ- __RefHeading___Toc22881_7842323221 SWOF __RefHeading___Toc45811_7190362561 SWOFLET __RefHeading___Toc398954_3017686537 SWU __RefHeading___Toc22883_7842323221 +SWUX __RefHeading___Toc22883_7842323221 +SWUX- __RefHeading___Toc22883_7842323221 +SWUY __RefHeading___Toc22883_7842323221 +SWUY- __RefHeading___Toc22883_7842323221 +SWUZ __RefHeading___Toc22883_7842323221 +SWUZ- __RefHeading___Toc22883_7842323221 TABDIMS __RefHeading___Toc89327_327352552 TBLK __RefHeading___Toc198434_3325167686 TCRIT REF_HEADING_KEYWORD_TCRIT @@ -970,9 +1190,9 @@ TUNINGS __RefHeading___Toc1647719_4250154414 TVDP __RefHeading___Toc210170_2884651453 TZONE __RefHeading___Toc1668114_4250154414 UDADIMS __RefHeading___Toc65914_1778172979 -UDQPARAM __RefHeading___Toc161093_2932703077 UDQ __RefHeading___Toc161095_2932703077 UDQDIMS __RefHeading___Toc65916_1778172979 +UDQPARAM __RefHeading___Toc161093_2932703077 UDT __RefHeading___Toc1674916_4250154414 UDTDIMS __RefHeading___Toc1681728_4250154414 UNCODHMD __RefHeading___Toc1688602_4250154414 diff --git a/scripts/python/pyproject.toml b/scripts/python/pyproject.toml index d3b21e3a..07c75fd9 100644 --- a/scripts/python/pyproject.toml +++ b/scripts/python/pyproject.toml @@ -28,7 +28,8 @@ fodt-extract-xml-tag = "fodt.splitter:extract_xml_tag" fodt-fix-ignored-keywords = "fodt.fix_ignored:fix_ignored" fodt-fix-footer-style = "fodt.fix_footer_style:fix_footer_style" fodt-fix-letter-k-footer = "fodt.fix_letter_k_footer:fix_letter_k_footer" -fodt-gen-kw-uri-map = "fodt.keyword_linker:gen_kw_uri_map" +fodt-gen-kw-uri-map = "fodt.keyword_uri_map_generator:gen_kw_uri_map_cli" +fodt-link-keywords = "fodt.keyword_linker:link_keywords" fodt-remove-bookmarks-from-master-styles = "fodt.remove_bookmarks:remove_bookmarks_from_master_styles" fodt-remove-chapters = "fodt.splitter:remove_chapters" fodt-remove-elements = "fodt.splitter:remove_elements" diff --git a/scripts/python/src/fodt/keyword_linker.py b/scripts/python/src/fodt/keyword_linker.py index 97c8d44a..bb54ef3a 100644 --- a/scripts/python/src/fodt/keyword_linker.py +++ b/scripts/python/src/fodt/keyword_linker.py @@ -1,3 +1,4 @@ +import io import logging import re import xml.sax @@ -11,53 +12,108 @@ from fodt.constants import ClickOptions, Directories, FileNames, FileExtensions from fodt.exceptions import HandlerDoneException, ParsingException -from fodt import helpers +from fodt import helpers, keyword_uri_map_generator from fodt.xml_helpers import XMLHelper -class ExtractURI_Handler(xml.sax.handler.ContentHandler): - def __init__(self, keyword_name: str) -> None: +class FileHandler(xml.sax.handler.ContentHandler): + def __init__(self, keyword_name: str, kw_uri_map: dict[str, str]) -> None: self.keyword_name = keyword_name + self.kw_uri_map = kw_uri_map self.in_section = False - self.in_bookmark = False - self.in_bookmark2 = False - self.uri = "" + # For empty tags, we use a special trick to rewrite them with a shortened + # end /> tag instead of the full end tag + self.start_tag_open = False + self.in_p = False + self.is_example_p = [] # Stack of boolean values: If current p tag is an example + self.p_recursion = 0 # We can have nested p tags + self.in_a = False + self.content = io.StringIO() + # Create a regex pattern with alternation on the keyword names + self.regex = self.compile_regex() + self.num_links_inserted = 0 + self.office_body_found = False + # Set of paragraph styles using fixed width fonts, intialized with the + # "_40_Example" style that is used indirectly by the other example styles + self.example_styles = {'_40_Example'} + # Special span style that have been manually inserted to indicate that + # a word should not be treated as a keyword and therefore should not be linked + self.not_keyword = False + + def compile_regex(self) -> re.Pattern: + # Also include the keyword name itself in the regex pattern, see discussion + # https://github.com/OPM/opm-reference-manual/pull/410 + pattern = re.compile( + r'(?CO2STORE - # we cannot expect to find the keyword name in the content. Instead we print a warning and - # the user can manually check the warning message to see if the keyword name was found. - logging.warning(f"Keyword name {self.keyword_name} not found in bookmark content: {content}") - raise HandlerDoneException("Done parsing.") + # NOTE: characters() is only called if there is content between the start + # tag and the end tag. If there is no content, characters() is not called. + if self.start_tag_open: + self.content.write(">") + self.start_tag_open = False + # NOTE: We need to escape the content before we apply the regex pattern + # because it may insert tags () that should not be escaped. + content = XMLHelper.escape(content) + if self.office_body_found: + if self.in_p and (not self.in_a) and (not self.not_keyword): + if not self.is_example_p[-1]: + if not self.is_table_caption(content): + content = self.regex.sub(self.replace_match_function, content) + self.content.write(content) + + def collect_style(self, attrs: xml.sax.xmlreader.AttributesImpl) -> None: + # Collect the paragraph styles that use fixed width fonts + if "style:name" in attrs.getNames(): + style_name = attrs.getValue("style:name") + self.example_styles.add(style_name) def endDocument(self): - raise ParsingException("Keyword name not found in document") + pass def endElement(self, name: str): - if name == "text:h": - if self.in_section: - self.in_section = False - # The keyword URI must be found within the text:h, since we are done parsing - # the tag, we raise an exception here to catch an unexpected situation. - raise ParsingException("Keyword name not found in document") - elif self.in_section and name == "text:bookmark-start": - if self.in_bookmark: - self.in_bookmark2 = True # In the middle of a bookmark between start and end - elif self.in_section and name == "text:bookmark-end": - if self.in_bookmark: - self.in_bookmark = False - raise ParsingException("Keyword name not found in document") - - def get_extracted_uri(self) -> str: - return self.uri + if self.office_body_found: + if name == "text:p": + self.p_recursion -= 1 + if self.p_recursion == 0: + self.in_p = False + self.is_example_p.pop() + elif name == "text:a": + self.in_a = False + elif name == "text:span": + self.not_keyword = False # This cannot be nested + if self.start_tag_open: + self.content.write("/>") + self.start_tag_open = False + else: + self.content.write(XMLHelper.endtag(name)) + + def get_content(self) -> str: + return self.content.getvalue() + + def get_num_links_inserted(self) -> int: + return self.num_links_inserted + + def is_table_caption(self, content: str) -> bool: + # Check if the content is a specific table caption, in that case we should not insert links + return re.search(rf'{re.escape(self.keyword_name)} Keyword Description', content) + + def replace_match_function(self, match: re.Match) -> str: + keyword = match.group(0) + uri = self.kw_uri_map[keyword] + self.num_links_inserted += 1 + return f'{keyword}' # This callback is used for debugging, it can be used to print # line numbers in the XML file @@ -65,99 +121,152 @@ def setDocumentLocator(self, locator): self.locator = locator def startDocument(self): - pass + self.content.write(XMLHelper.header) def startElement(self, name:str, attrs: xml.sax.xmlreader.AttributesImpl): - if name == "text:h": - if "text:outline-level" in attrs.getNames(): - level = attrs.getValue("text:outline-level") - if level == "3": - self.in_section = True - elif self.in_section and name == "text:bookmark-start": - self.in_bookmark = True - # Assume there always will be a text:bookmark-start tag immediately - # following the text:h tag. This element will contain the keyword name. - self.uri = attrs.getValue("text:name") - - -class ExtractURI: - def __init__(self, file: Path, keyword_name: str) -> None: - self.filename = file - self.keyword_name = keyword_name + if self.start_tag_open: + self.content.write(">") # Close the start tag + self.start_tag_open = False + if not self.office_body_found: + if name == "office:body": + self.office_body_found = True + else: + if name == "style:style": + if "style:parent-style-name" in attrs.getNames(): + if attrs.getValue("style:parent-style-name") == "_40_Example": + self.collect_style(attrs) + else: + if name == "text:p": + self.in_p = True + self.p_recursion += 1 + self.update_example_stack(attrs) + elif name == "text:a": + # We are inside an anchor, and we should not insert another text:a tag here + self.in_a = True + elif name == "text:span": + if "text:style-name" in attrs.getNames(): + style_name = attrs.getValue("text:style-name") + if style_name == "NOT_KEYWORD": + self.not_keyword = True + self.start_tag_open = True + self.content.write(XMLHelper.starttag(name, attrs, close_tag=False)) + + def update_example_stack(self, attrs: xml.sax.xmlreader.AttributesImpl) -> None: + if "text:style-name" in attrs.getNames(): + style_name = attrs.getValue("text:style-name") + self.is_example_p.append(style_name in self.example_styles) + else: + self.is_example_p.append(False) + +class InsertLinks(): + def __init__( + self, maindir: Path, subsection: str, kw_dir: Path, kw_uri_map: dict[str, str] + ) -> None: + self.maindir = maindir + self.kw_dir = kw_dir + self.kw_uri_map = kw_uri_map + self.subsection = subsection + + def insert_links(self) -> None: + for item in self.kw_dir.iterdir(): + if not item.is_dir(): + continue + if self.subsection: + if item.name != self.subsection: + logging.info(f"Skipping directory: {item}") + continue + logging.info(f"Processing directory: {item}") + for item2 in item.iterdir(): + if item2.suffix == f".{FileExtensions.fodt}": + keyword_name = item2.name.removesuffix(f".{FileExtensions.fodt}") + self.insert_links_in_file(item2, keyword_name) - def extract(self) -> str: + def insert_links_in_file(self, filename: Path, keyword_name: str) -> None: parser = xml.sax.make_parser() - handler = ExtractURI_Handler(self.keyword_name) + handler = FileHandler(keyword_name, self.kw_uri_map) parser.setContentHandler(handler) try: - parser.parse(str(self.filename)) + parser.parse(str(filename)) except HandlerDoneException as e: pass - uri = handler.get_extracted_uri() - return uri - -class ProcessChapter: - def __init__(self, maindir: Path, chapter: str, section: str, kw_file: Path, kw_uri_map: dict[str, str]) -> None: - self.chapter = chapter - self.section = section - self.kw_file = kw_file - self.kw_uri_map = kw_uri_map - self.maindir = maindir + num_links_inserted = handler.get_num_links_inserted() + if num_links_inserted > 0: + with open(filename, "w", encoding='utf8') as f: + f.write(handler.content.getvalue()) + logging.info(f"{filename.name}: Inserted {num_links_inserted} links.") + else: + logging.info(f"{filename.name}: No links inserted.") + + +def load_kw_uri_map(maindir: Path) -> dict[str, str]: + kw_uri_map_path = maindir / Directories.meta / FileNames.kw_uri_map + if not kw_uri_map_path.exists(): + raise FileNotFoundError(f"File not found: {kw_uri_map_path}") + kw_uri_map = {} + with open(kw_uri_map_path, "r", encoding='utf-8') as f: + for line in f: + # Each line is on the format " " where is the keyword name and + # does not contain any whitespace characters, and is the URI of the + # keyword subsection subdocument. The may contain whitespace characters. + # There is a single whitespace character between and . + match = re.match(r"(\S+)\s+(.+)", line) + if match: + parts = match.groups() + kw_uri_map[parts[0]] = parts[1] + else: + raise ParsingException(f"Could not parse line: {line}") + return kw_uri_map - def process(self) -> None: - with open(self.kw_file, "r", encoding='utf8') as f: - for line in f: - keyword = line.strip() - uri = self.keyword_uri(keyword) - self.kw_uri_map[keyword] = uri - - def keyword_uri(self, keyword: str) -> str: - kw_file = (self.maindir / Directories.chapters / Directories.subsections / - f"{self.chapter}.{self.section}" / f"{keyword}.{FileExtensions.fodt}") - uri = ExtractURI(kw_file, keyword).extract() - return uri - -# fodt-gen-kw-uri-map -# ------------------- +# fodt-link-keywords +# ------------------ # # SHELL USAGE: # -# fodt-gen-kw-uri-map --maindir= --keyword_dir= +# fodt-link-keyword \ +# --maindir= --keyword_dir= --subsection= --use-map-file # # DESCRIPTION: # -# Generates a map: KW_NAME -> URI for all keywords. The map is saved to the file -# "meta/kw_uri_map.txt" in the main directory. +# Links all keyword names found inside

tags in the subsection documents to the +# corresponding keyword subsection subdocument. +# +# If the option --use-map-file is given, the script will use the mapping file +# "meta/kw_uri_map.txt" (generated by running the script "fodt-gen-kw-uri-map"), else +# it will generate the mapping on the fly. The mapping is a map from keyword name to +# the URI of the keyword subsection subdocument. This map is needed to generate the +# links. # -# EXAMPLE: +# If --subsection is not given, the script will process all subsections. If --subsection +# is given, the script will only process the specified subsection. # -# fodt-gen-kw-uri-map +# EXAMPLES: # -# Will use the default values: --maindir=../../parts --keyword_dir=../../keyword-names +# fodt-link-keywords --subsection=5.3 +# +# Will use the default values: --maindir=../../parts, --keyword_dir=../../keyword-names, +# and will process only the keywords in subsection 5.3, and will generate the mapping on the fly. +# +# fodt-link-keywords +# +# Same as above, but will process all subsections. # @click.command() @ClickOptions.maindir() @ClickOptions.keyword_dir -def gen_kw_uri_map(maindir: str|None, keyword_dir: str|None) -> None: +@click.option('--subsection', help='The subsection to process') +@click.option('--use-map-file', is_flag=True, help='Use the mapping file "meta/kw_uri_map.txt"') +def link_keywords( + maindir: str|None, keyword_dir: str|None, subsection: str|None, use_map_file: bool +) -> None: logging.basicConfig(level=logging.INFO) - keyword_dir = helpers.get_keyword_dir(keyword_dir) maindir = helpers.get_maindir(maindir) - kw_uri_map = {} - # Assume all directories in keyword_dir are keyword directories on the form xx.yy - # where xx is the chapter number and yy is the section number. - for item1 in keyword_dir.iterdir(): - if not item1.is_dir(): - continue - chapter_str = item1.name - (chapter, section) = chapter_str.split(".") - kw_file = item1 / FileNames.keywords - logging.info(f"Processing chapter {chapter_str}") - ProcessChapter(maindir, chapter, section, kw_file, kw_uri_map).process() - - with open(maindir / Directories.meta / FileNames.kw_uri_map, "w", encoding='utf8') as f: - for kw in sorted(kw_uri_map.keys()): - f.write(f"{kw} {kw_uri_map[kw]}\n") - logging.info(f"Generated keyword URI map to {maindir / Directories.meta / FileNames.kw_uri_map}") + keyword_dir = helpers.get_keyword_dir(keyword_dir) + if use_map_file: + kw_uri_map = load_kw_uri_map(maindir) + else: + kw_uri_map = keyword_uri_map_generator.get_kw_uri_map(maindir, keyword_dir) + kw_dir = maindir / Directories.chapters / Directories.subsections + InsertLinks(maindir, subsection, kw_dir, kw_uri_map).insert_links() if __name__ == "__main__": - gen_kw_uri_map() + link_keywords() diff --git a/scripts/python/src/fodt/keyword_uri_map_generator.py b/scripts/python/src/fodt/keyword_uri_map_generator.py new file mode 100644 index 00000000..7b885b8f --- /dev/null +++ b/scripts/python/src/fodt/keyword_uri_map_generator.py @@ -0,0 +1,190 @@ +import logging +import re +import xml.sax +import xml.sax.handler +import xml.sax.xmlreader +import xml.sax.saxutils + +from pathlib import Path + +import click + +from fodt.constants import ClickOptions, Directories, FileNames, FileExtensions +from fodt.exceptions import HandlerDoneException, ParsingException +from fodt import helpers +from fodt.xml_helpers import XMLHelper + +class ExtractURI_Handler(xml.sax.handler.ContentHandler): + def __init__(self, keyword_name: str) -> None: + self.keyword_name = keyword_name + self.in_section = False + self.in_bookmark = False + self.in_bookmark2 = False + self.uri = "" + + def characters(self, content: str): + if self.in_bookmark2: + # Check if the content matches the keyword name + match = re.search(self.keyword_name, content) + if match is None: + #self.uri = None + #raise ParsingException( + # f"Keyword name {self.keyword_name} not found in bookmark content: {content}" + #) + # NOTE: Since the content may contain span tags, for example the CO2STORE keyword goes like this + # CO2STORE + # we cannot expect to find the keyword name in the content. Instead we print a warning and + # the user can manually check the warning message to see if the keyword name was found. + logging.warning(f"Keyword name {self.keyword_name} not found in bookmark content: {content}") + raise HandlerDoneException("Done parsing.") + + def endDocument(self): + raise ParsingException("Keyword name not found in document") + + def endElement(self, name: str): + if name == "text:h": + if self.in_section: + self.in_section = False + # The keyword URI must be found within the text:h, since we are done parsing + # the tag, we raise an exception here to catch an unexpected situation. + raise ParsingException("Keyword name not found in document") + elif self.in_section and name == "text:bookmark-start": + if self.in_bookmark: + self.in_bookmark2 = True # In the middle of a bookmark between start and end + elif self.in_section and name == "text:bookmark-end": + if self.in_bookmark: + self.in_bookmark = False + raise ParsingException("Keyword name not found in document") + + def get_extracted_uri(self) -> str: + return self.uri + + # This callback is used for debugging, it can be used to print + # line numbers in the XML file + def setDocumentLocator(self, locator): + self.locator = locator + + def startDocument(self): + pass + + def startElement(self, name:str, attrs: xml.sax.xmlreader.AttributesImpl): + if name == "text:h": + if "text:outline-level" in attrs.getNames(): + level = attrs.getValue("text:outline-level") + if level == "3": + self.in_section = True + elif self.in_section and name == "text:bookmark-start": + self.in_bookmark = True + # Assume there always will be a text:bookmark-start tag immediately + # following the text:h tag. This element will contain the keyword name. + self.uri = attrs.getValue("text:name") + + +class ExtractURI: + def __init__(self, file: Path, keyword_name: str) -> None: + self.filename = file + self.keyword_name = keyword_name + + def extract(self) -> str: + parser = xml.sax.make_parser() + handler = ExtractURI_Handler(self.keyword_name) + parser.setContentHandler(handler) + try: + parser.parse(str(self.filename)) + except HandlerDoneException as e: + pass + uri = handler.get_extracted_uri() + return uri + +class ProcessChapter: + def __init__(self, maindir: Path, chapter: str, section: str, kw_file: Path, kw_uri_map: dict[str, str]) -> None: + self.chapter = chapter + self.section = section + self.kw_file = kw_file + self.kw_uri_map = kw_uri_map + self.maindir = maindir + + def process(self) -> None: + with open(self.kw_file, "r", encoding='utf8') as f: + for line in f: + keyword = line.strip() + uri = self.keyword_uri(keyword) + self.kw_uri_map[keyword] = uri + + def keyword_uri(self, keyword: str) -> str: + kw_file = (self.maindir / Directories.chapters / Directories.subsections / + f"{self.chapter}.{self.section}" / f"{keyword}.{FileExtensions.fodt}") + uri = ExtractURI(kw_file, keyword).extract() + return uri + +def get_kw_uri_map(maindir: Path, keyword_dir: Path) -> dict[str, str]: + kw_uri_map = {} + # Assume all directories in keyword_dir are keyword directories on the form xx.yy + # where xx is the chapter number and yy is the section number. + for item1 in keyword_dir.iterdir(): + if not item1.is_dir(): + continue + chapter_str = item1.name + (chapter, section) = chapter_str.split(".") + kw_file = item1 / FileNames.keywords + logging.info(f"Processing chapter {chapter_str}") + ProcessChapter(maindir, chapter, section, kw_file, kw_uri_map).process() + add_keyword_aliases(kw_uri_map) + return kw_uri_map + +def add_keyword_aliases(kw_uri_map: dict[str, str]) -> None: + # Add aliases for keywords + for keyword in ["ENKRVD", "ENPTVD", "IKRG", "IKRGR", "IKRO", "IKRORG", "IKRORW", + "IKRW", "IKRWR", "IMBNUM", "IMKRVD", "IMPTVD", "ISGCR", "ISGL", + "ISGU", "ISOGCR", "ISOWCR", "ISWCR", "ISWL", "ISWU", + "KRG", "KRGR", "KRNUM", "KRO", "KRORG", "KRORW", "KRW", "KRWR", + "SGCR", "SGL", "SGU", "SOGCR", "SOWCR", "SWCR", "SWL", "SWU"]: + add_xyz_aliases(kw_uri_map, keyword) + for keyword in ["KRNUM"]: + add_rt_aliases(kw_uri_map, keyword) + +def add_xyz_aliases(kw_uri_map: dict[str, str], keyword: str) -> None: + for extension in ["X", "Y", "Z", "X-", "Y-", "Z-"]: + add_alias(kw_uri_map, keyword, f"{keyword}{extension}") + +def add_rt_aliases(kw_uri_map: dict[str, str], keyword: str) -> None: + for extension in ["R", "T", "R-", "T-"]: + add_alias(kw_uri_map, keyword, f"{keyword}{extension}") + +def add_alias(kw_uri_map: dict[str, str], keyword: str, alias: str) -> None: + uri = kw_uri_map[keyword] + kw_uri_map[alias] = uri + +# fodt-gen-kw-uri-map +# ------------------- +# +# SHELL USAGE: +# +# fodt-gen-kw-uri-map --maindir= --keyword_dir= +# +# DESCRIPTION: +# +# Generates a map: KW_NAME -> URI for all keywords. The map is saved to the file +# "meta/kw_uri_map.txt" in the main directory. +# +# EXAMPLE: +# +# fodt-gen-kw-uri-map +# +# Will use the default values: --maindir=../../parts --keyword_dir=../../keyword-names +# +@click.command() +@ClickOptions.maindir() +@ClickOptions.keyword_dir +def gen_kw_uri_map_cli(maindir: str|None, keyword_dir: str|None) -> None: + logging.basicConfig(level=logging.INFO) + keyword_dir = helpers.get_keyword_dir(keyword_dir) + maindir = helpers.get_maindir(maindir) + kw_uri_map = get_kw_uri_map(maindir, keyword_dir) + with open(maindir / Directories.meta / FileNames.kw_uri_map, "w", encoding='utf8') as f: + for kw in sorted(kw_uri_map.keys()): + f.write(f"{kw} {kw_uri_map[kw]}\n") + logging.info(f"Generated keyword URI map to {maindir / Directories.meta / FileNames.kw_uri_map}") + +if __name__ == "__main__": + gen_kw_uri_map_cli()