diff --git a/deps.edn b/deps.edn index 54044b1..9194bbf 100644 --- a/deps.edn +++ b/deps.edn @@ -14,7 +14,12 @@ {org.apache.logging.log4j/log4j-api {:mvn/version "2.11.0"} org.apache.logging.log4j/log4j-core {:mvn/version "2.11.0"} org.apache.logging.log4j/log4j-slf4j-impl {:mvn/version "2.11.0"}}} - :dev {} + :dev { + :extra-deps {com.clojure-goes-fast/clj-async-profiler {:mvn/version "0.5.0"}} + + :jvm-opts ["-Djdk.attach.allowAttachSelf" ;; for jdk9+ + ] + } :test {:extra-deps {lambdaisland/kaocha {:mvn/version "0.0-573"} diff --git a/src/csv2rdf/csvw.clj b/src/csv2rdf/csvw.clj index a9de919..500cb5c 100644 --- a/src/csv2rdf/csvw.clj +++ b/src/csv2rdf/csvw.clj @@ -14,9 +14,9 @@ (:import [org.eclipse.rdf4j.rio RDFFormat])) (defn- get-table-statements [context {:keys [url dialect] :as table} table-group-dialect] - (let [dialect (or dialect table-group-dialect)] - (let [annotated-rows (csv/annotated-rows url table dialect)] - (table-statements context table annotated-rows)))) + (let [dialect (or dialect table-group-dialect) + annotated-rows (csv/annotated-rows url table dialect)] + (table-statements context table annotated-rows))) (defn csv->rdf "Runs the CSVW process for the given tabular or metadata data sources and options. If metadata-source diff --git a/src/csv2rdf/csvw/common.clj b/src/csv2rdf/csvw/common.clj index 7343a5b..2100682 100644 --- a/src/csv2rdf/csvw/common.clj +++ b/src/csv2rdf/csvw/common.clj @@ -8,10 +8,12 @@ (:import [java.net URI] [org.eclipse.rdf4j.model.impl URIImpl])) +(def bnode-id-counter (atom 0)) + (defn gen-blank-node "Generates a grafter representation of a new blank node" ([] (gen-blank-node "bnode")) - ([prefix] (gproto/make-blank-node (str prefix (gensym))))) + ([prefix] (gproto/make-blank-node (str prefix "__" (swap! bnode-id-counter inc))))) (defn row-unsuppressed-cells "Gets the all the cells within a row whose column output is not suppressed" diff --git a/src/csv2rdf/main.clj b/src/csv2rdf/main.clj index 9785df3..01dab8c 100644 --- a/src/csv2rdf/main.clj +++ b/src/csv2rdf/main.clj @@ -67,20 +67,40 @@ (println "Usage:") (println summary))) +(defn- inner-main [args] + (let [options (parse-cli-options args) + {:keys [mode tabular user-metadata output-file]} options + opts {:tabular-source (some-> tabular parse-source) + :metadata-source (some-> user-metadata parse-source) + :rdf-format (or (some-> output-file formats/->rdf-format) RDFFormat/TURTLE) + :mode mode} + output-file (some-> output-file io/file)] + (if output-file + (with-open [w (io/writer output-file)] + (write-output w opts)) + (write-output (io/writer *out*) opts)))) + (defn- -main [& args] (try - (let [options (parse-cli-options args) - {:keys [mode tabular user-metadata output-file]} options - opts {:tabular-source (some-> tabular parse-source) - :metadata-source (some-> user-metadata parse-source) - :rdf-format (or (some-> output-file formats/->rdf-format) RDFFormat/TURTLE) - :mode mode} - output-file (some-> output-file io/file)] - (if output-file - (with-open [w (io/writer output-file)] - (write-output w opts)) - (write-output (io/writer *out*) opts)) - (System/exit 0)) + (inner-main args) + (System/exit 0) (catch Throwable ex (display-error ex) (System/exit 1)))) + + +(comment + + (time (inner-main ["-t" "out/hmrc-rts-small-area.csv" "-u" "out/hmrc-rts-small-area.csv-metadata.json" "-m" "annotated" "-o" "cube.nt"])) + + (require '[clj-async-profiler.core :as prof]) + +;; Profile the following expression: + (prof/profile (inner-main ["-t" "out/hmrc-rts-small-area.csv" "-u" "out/hmrc-rts-small-area.csv-metadata.json" "-m" "annotated" "-o" "cube.nt"])) + +;; The resulting flamegraph will be stored in /tmp/clj-async-profiler/results/ +;; You can view the SVG directly from there or start a local webserver: + +(prof/serve-files 8080) ; Serve on port 8080 + + ) diff --git a/src/csv2rdf/tabular/cell.clj b/src/csv2rdf/tabular/cell.clj index d784cfb..aa9c87c 100644 --- a/src/csv2rdf/tabular/cell.clj +++ b/src/csv2rdf/tabular/cell.clj @@ -67,12 +67,15 @@ (defn ^{:tabular-spec "6.4.9"} validate-length "Validates the length of the cell value is valid for the constraints on the column metadata" [{:keys [value datatype] :as cell-element}] - (if-let [len (xml-datatype/get-length value datatype)] - (let [len-errors (->> length-relations - (map (fn [[k sym]] (get-length-error cell-element sym len (get datatype k)))) - (remove nil?))] - (add-cell-errors cell-element len-errors)) - cell-element)) + (let [{:keys [length minLength maxLength]} datatype] + (if (or length minLength maxLength) + (if-let [len (xml-datatype/get-length value datatype)] + (let [len-errors (->> length-relations + (map (fn [[k sym]] (get-length-error cell-element sym len (get datatype k)))) + (remove nil?))] + (add-cell-errors cell-element len-errors)) + cell-element) + cell-element))) (defn ^{:tabular-spec "6.4.9"} validate-value-bounds "Validates the cell value is valid for any bounds specified on its datatype" diff --git a/src/csv2rdf/tabular/csv/reader.clj b/src/csv2rdf/tabular/csv/reader.clj index 26976b2..e8f1a91 100644 --- a/src/csv2rdf/tabular/csv/reader.clj +++ b/src/csv2rdf/tabular/csv/reader.clj @@ -286,30 +286,30 @@ to the dialect trim mode." [^String row-content source-row-number {:keys [^Character escapeChar ^Character quoteChar ^Character delimiter trim-mode] :as options}] (if (zero? (.length row-content)) - [] - (let [;;NOTE: cells are parsed slightly differently depending on - ;;whether the quote and escape characters are the same or - ;;different. Each parser parses the next cell contents - ;;from the specified starting position within the string - ;;and returns a map containing the parsed cell contents - ;;and the index within the string to continue - ;;parsing. This should always point to the delimiter - ;;character or be one passed the end of the string. - cell-parser (if (= escapeChar quoteChar) - parse-cell-double-quote - parse-cell-escape)] - (loop [idx 0 - cells []] - (let [{:keys [cell ^long next-index]} (cell-parser row-content source-row-number idx options)] - ;;if there is any remaining input, next-index should refer to the delimiter - ;;consume it and move to the start of the next cell - ;;otherwise entire row has been parsed - ;;TODO: move quoted cell followed by delimiter validation stuff here? - (if (< next-index (.length row-content)) - (let [next-char (.charAt row-content next-index)] - #_(assert (= delimiter next-char "Expected delimiter after parsed cell")) - (recur (inc next-index) (conj cells (trim-cell cell trim-mode)))) - (conj cells (trim-cell cell trim-mode)))))))) + [] + (let [ ;;NOTE: cells are parsed slightly differently depending on + ;;whether the quote and escape characters are the same or + ;;different. Each parser parses the next cell contents + ;;from the specified starting position within the string + ;;and returns a map containing the parsed cell contents + ;;and the index within the string to continue + ;;parsing. This should always point to the delimiter + ;;character or be one passed the end of the string. + cell-parser (if (= escapeChar quoteChar) + parse-cell-double-quote + parse-cell-escape)] + (persistent! (loop [idx 0 + cells (transient [])] + (let [{:keys [cell ^long next-index]} (cell-parser row-content source-row-number idx options)] + ;;if there is any remaining input, next-index should refer to the delimiter + ;;consume it and move to the start of the next cell + ;;otherwise entire row has been parsed + ;;TODO: move quoted cell followed by delimiter validation stuff here? + (if (< next-index (.length row-content)) + (let [next-char (.charAt row-content next-index)] + #_(assert (= delimiter next-char "Expected delimiter after parsed cell")) + (recur (inc next-index) (conj! cells (trim-cell cell trim-mode)))) + (conj! cells (trim-cell cell trim-mode))))))))) (s/def ::source-row-number (s/and integer? pos?)) (s/def ::cells (s/coll-of string? :kind vector? :into [])) diff --git a/src/csv2rdf/xml/datatype.clj b/src/csv2rdf/xml/datatype.clj index fb2d14c..b09d1c3 100644 --- a/src/csv2rdf/xml/datatype.clj +++ b/src/csv2rdf/xml/datatype.clj @@ -113,11 +113,11 @@ (if-let [root (find-root type-name)] (node-subtypes root))) -(defn is-subtype? +(def is-subtype? "Returns whether type b is a subtype of type a" - [a b] - (let [type-name (resolve-type-name b)] - (boolean (some #(= type-name %) (subtypes a))))) + (memoize (fn [a b] + (let [type-name (resolve-type-name b)] + (boolean (some #(= type-name %) (subtypes a))))))) (defn is-binary-type? [type-name] (contains? #{"hexBinary" "base64Binary"} (resolve-type-name type-name)))