Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Perf improvements #58

Merged
merged 7 commits into from
May 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion deps.edn
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@
{org.apache.logging.log4j/log4j-api {:mvn/version "2.11.0"}
org.apache.logging.log4j/log4j-core {:mvn/version "2.11.0"}
org.apache.logging.log4j/log4j-slf4j-impl {:mvn/version "2.11.0"}}}
;; Development-only alias: adds the profiler used from the REPL.
:dev
{:extra-deps {com.clojure-goes-fast/clj-async-profiler {:mvn/version "0.5.0"}}
 ;; for jdk9+
 :jvm-opts ["-Djdk.attach.allowAttachSelf"]}
:test
{:extra-deps
{lambdaisland/kaocha {:mvn/version "0.0-573"}
Expand Down
6 changes: 3 additions & 3 deletions src/csv2rdf/csvw.clj
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
(:import [org.eclipse.rdf4j.rio RDFFormat]))

(defn- get-table-statements
  "Produces the statements for a single table.

  The dialect is the table's own :dialect when present, otherwise
  table-group-dialect. The annotated rows are read from the table's :url
  with csv/annotated-rows and handed to table-statements together with
  context and the table itself."
  [context {:keys [url dialect] :as table} table-group-dialect]
  (let [dialect (or dialect table-group-dialect)
        annotated-rows (csv/annotated-rows url table dialect)]
    (table-statements context table annotated-rows)))

(defn csv->rdf
"Runs the CSVW process for the given tabular or metadata data sources and options. If metadata-source
Expand Down
4 changes: 3 additions & 1 deletion src/csv2rdf/csvw/common.clj
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
(:import [java.net URI]
[org.eclipse.rdf4j.model.impl URIImpl]))

;; Counter incremented by gen-blank-node so that each generated blank node
;; id gets a distinct numeric suffix. Starts at 0.
(def bnode-id-counter (atom 0))

(defn gen-blank-node
  "Generates a grafter representation of a new blank node.

  With no arguments the id prefix is \"bnode\". With a prefix, the id is
  the prefix, followed by \"__\", followed by the next value of
  bnode-id-counter."
  ([] (gen-blank-node "bnode"))
  ([prefix]
   ;; swap! returns the incremented value, so every call sees a fresh number.
   (let [id (swap! bnode-id-counter inc)]
     (gproto/make-blank-node (str prefix "__" id)))))

(defn row-unsuppressed-cells
"Gets the all the cells within a row whose column output is not suppressed"
Expand Down
44 changes: 32 additions & 12 deletions src/csv2rdf/main.clj
Original file line number Diff line number Diff line change
Expand Up @@ -67,20 +67,40 @@
(println "Usage:")
(println summary)))

(defn- inner-main
  "Parses the command-line args and runs write-output with the resulting
  options.

  When an output file is given the output is written to it (the writer is
  closed afterwards) and the RDF format is derived from the file via
  formats/->rdf-format, falling back to Turtle. Without an output file the
  output goes to a writer wrapping *out* in Turtle format."
  [args]
  (let [{:keys [mode tabular user-metadata output-file]} (parse-cli-options args)
        tabular-source (some-> tabular parse-source)
        metadata-source (some-> user-metadata parse-source)
        rdf-format (or (some-> output-file formats/->rdf-format)
                       RDFFormat/TURTLE)
        run-opts {:tabular-source tabular-source
                  :metadata-source metadata-source
                  :rdf-format rdf-format
                  :mode mode}]
    (if-let [destination (some-> output-file io/file)]
      (with-open [file-writer (io/writer destination)]
        (write-output file-writer run-opts))
      ;; NOTE(review): the writer wrapping *out* is neither flushed nor closed
      ;; here - confirm write-output flushes before the caller exits.
      (write-output (io/writer *out*) run-opts))))

(defn- -main
  "Command-line entry point.

  Delegates all work to inner-main and exits the JVM with status 0 when it
  returns. Any Throwable is reported through display-error and the JVM
  exits with status 1."
  [& args]
  (try
    ;; The option parsing and output logic lives in inner-main so it can be
    ;; invoked (e.g. from a REPL) without terminating the JVM.
    (inner-main args)
    (System/exit 0)
    (catch Throwable ex
      (display-error ex)
      (System/exit 1))))


;; REPL scratchpad for timing and profiling a conversion run. Nothing in this
;; form is evaluated when the namespace is loaded. The profiler namespace comes
;; from clj-async-profiler, which is declared under the :dev alias in deps.edn.
(comment

;; Wall-clock timing of a single run; inner-main is used rather than -main
;; because -main calls System/exit.
(time (inner-main ["-t" "out/hmrc-rts-small-area.csv" "-u" "out/hmrc-rts-small-area.csv-metadata.json" "-m" "annotated" "-o" "cube.nt"]))

(require '[clj-async-profiler.core :as prof])

;; Profile the following expression:
(prof/profile (inner-main ["-t" "out/hmrc-rts-small-area.csv" "-u" "out/hmrc-rts-small-area.csv-metadata.json" "-m" "annotated" "-o" "cube.nt"]))

;; The resulting flamegraph will be stored in /tmp/clj-async-profiler/results/
;; You can view the SVG directly from there or start a local webserver:

(prof/serve-files 8080) ; Serve on port 8080

)
15 changes: 9 additions & 6 deletions src/csv2rdf/tabular/cell.clj
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,15 @@
(defn ^{:tabular-spec "6.4.9"} validate-length
  "Validates the length of the cell value is valid for the constraints on the column metadata.

  When the datatype declares none of length, minLength or maxLength the
  cell is returned unchanged without computing the value's length. The cell
  is also returned unchanged when xml-datatype/get-length yields no length
  for the value. Otherwise every entry of length-relations is checked with
  get-length-error and the resulting errors are added to the cell."
  [{:keys [value datatype] :as cell-element}]
  (let [{:keys [length minLength maxLength]} datatype
        ;; Skip the (potentially costly) length computation for the common
        ;; case of a datatype with no length constraints at all.
        constrained? (or length minLength maxLength)]
    (if-let [len (and constrained? (xml-datatype/get-length value datatype))]
      (let [len-errors (keep (fn [[k sym]]
                               (get-length-error cell-element sym len (get datatype k)))
                             length-relations)]
        (add-cell-errors cell-element len-errors))
      cell-element)))

(defn ^{:tabular-spec "6.4.9"} validate-value-bounds
"Validates the cell value is valid for any bounds specified on its datatype"
Expand Down
48 changes: 24 additions & 24 deletions src/csv2rdf/tabular/csv/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -286,30 +286,30 @@
to the dialect trim mode."
[^String row-content source-row-number {:keys [^Character escapeChar ^Character quoteChar ^Character delimiter trim-mode] :as options}]
(if (zero? (.length row-content))
[]
(let [;;NOTE: cells are parsed slightly differently depending on
;;whether the quote and escape characters are the same or
;;different. Each parser parses the next cell contents
;;from the specified starting position within the string
;;and returns a map containing the parsed cell contents
;;and the index within the string to continue
;;parsing. This should always point to the delimiter
;;character or be one past the end of the string.
cell-parser (if (= escapeChar quoteChar)
parse-cell-double-quote
parse-cell-escape)]
(loop [idx 0
cells []]
(let [{:keys [cell ^long next-index]} (cell-parser row-content source-row-number idx options)]
;;if there is any remaining input, next-index should refer to the delimiter
;;consume it and move to the start of the next cell
;;otherwise entire row has been parsed
;;TODO: move quoted cell followed by delimiter validation stuff here?
(if (< next-index (.length row-content))
(let [next-char (.charAt row-content next-index)]
#_(assert (= delimiter next-char "Expected delimiter after parsed cell"))
(recur (inc next-index) (conj cells (trim-cell cell trim-mode))))
(conj cells (trim-cell cell trim-mode))))))))
[]
(let [ ;;NOTE: cells are parsed slightly differently depending on
;;whether the quote and escape characters are the same or
;;different. Each parser parses the next cell contents
;;from the specified starting position within the string
;;and returns a map containing the parsed cell contents
;;and the index within the string to continue
;;parsing. This should always point to the delimiter
;;character or be one past the end of the string.
cell-parser (if (= escapeChar quoteChar)
parse-cell-double-quote
parse-cell-escape)]
(persistent! (loop [idx 0
cells (transient [])]
(let [{:keys [cell ^long next-index]} (cell-parser row-content source-row-number idx options)]
;;if there is any remaining input, next-index should refer to the delimiter
;;consume it and move to the start of the next cell
;;otherwise entire row has been parsed
;;TODO: move quoted cell followed by delimiter validation stuff here?
(if (< next-index (.length row-content))
(let [next-char (.charAt row-content next-index)]
#_(assert (= delimiter next-char "Expected delimiter after parsed cell"))
(recur (inc next-index) (conj! cells (trim-cell cell trim-mode))))
(conj! cells (trim-cell cell trim-mode)))))))))

;; A source row number must be an integer greater than zero.
(s/def ::source-row-number (s/and integer? pos?))
;; The cells of a row: a vector whose elements are all strings.
(s/def ::cells (s/coll-of string? :kind vector? :into []))
Expand Down
8 changes: 4 additions & 4 deletions src/csv2rdf/xml/datatype.clj
Original file line number Diff line number Diff line change
Expand Up @@ -113,11 +113,11 @@
(if-let [root (find-root type-name)]
(node-subtypes root)))

;; Defined with def + memoize rather than defn so repeated checks for the same
;; pair of types are answered from a cache. :arglists is supplied by hand
;; because def, unlike defn, does not record it.
;; NOTE(review): memoize never evicts entries - presumably the set of datatype
;; names is small and fixed; confirm.
(def ^{:arglists '([a b])} is-subtype?
  "Returns whether type b is a subtype of type a"
  (memoize
    (fn [a b]
      (let [type-name (resolve-type-name b)]
        (boolean (some #(= type-name %) (subtypes a)))))))

(defn is-binary-type?
  "Returns true when type-name, after resolve-type-name, is one of the
  binary datatypes \"hexBinary\" or \"base64Binary\"; false otherwise."
  [type-name]
  (let [binary-type-names #{"hexBinary" "base64Binary"}
        resolved (resolve-type-name type-name)]
    (contains? binary-type-names resolved)))
Expand Down