Skip to content

Commit

Permalink
Merge pull request #58 from Swirrl/perf-improvements
Browse files Browse the repository at this point in the history
Perf improvements
  • Loading branch information
RickMoynihan authored May 12, 2021
2 parents bf428c0 + a725a9f commit c79bcb1
Show file tree
Hide file tree
Showing 7 changed files with 81 additions and 51 deletions.
7 changes: 6 additions & 1 deletion deps.edn
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@
{org.apache.logging.log4j/log4j-api {:mvn/version "2.11.0"}
org.apache.logging.log4j/log4j-core {:mvn/version "2.11.0"}
org.apache.logging.log4j/log4j-slf4j-impl {:mvn/version "2.11.0"}}}
:dev {}
:dev {
:extra-deps {com.clojure-goes-fast/clj-async-profiler {:mvn/version "0.5.0"}}

:jvm-opts ["-Djdk.attach.allowAttachSelf" ;; for jdk9+
]
}
:test
{:extra-deps
{lambdaisland/kaocha {:mvn/version "0.0-573"}
Expand Down
6 changes: 3 additions & 3 deletions src/csv2rdf/csvw.clj
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
(:import [org.eclipse.rdf4j.rio RDFFormat]))

(defn- get-table-statements [context {:keys [url dialect] :as table} table-group-dialect]
(let [dialect (or dialect table-group-dialect)]
(let [annotated-rows (csv/annotated-rows url table dialect)]
(table-statements context table annotated-rows))))
(let [dialect (or dialect table-group-dialect)
annotated-rows (csv/annotated-rows url table dialect)]
(table-statements context table annotated-rows)))

(defn csv->rdf
"Runs the CSVW process for the given tabular or metadata data sources and options. If metadata-source
Expand Down
4 changes: 3 additions & 1 deletion src/csv2rdf/csvw/common.clj
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
(:import [java.net URI]
[org.eclipse.rdf4j.model.impl URIImpl]))

;; Shared counter incremented by gen-blank-node (via swap!/inc) so that every
;; generated blank node id gets a distinct numeric suffix.
(def bnode-id-counter (atom 0))

(defn gen-blank-node
"Generates a grafter representation of a new blank node"
([] (gen-blank-node "bnode"))
([prefix] (gproto/make-blank-node (str prefix (gensym)))))
([prefix] (gproto/make-blank-node (str prefix "__" (swap! bnode-id-counter inc)))))

(defn row-unsuppressed-cells
"Gets all the cells within a row whose column output is not suppressed"
Expand Down
44 changes: 32 additions & 12 deletions src/csv2rdf/main.clj
Original file line number Diff line number Diff line change
Expand Up @@ -67,20 +67,40 @@
(println "Usage:")
(println summary)))

;; Runs one conversion for the given command-line `args`: parses the options,
;; builds the option map handed to write-output, and writes either to the
;; requested output file or to *out*. It does not itself call System/exit, so
;; it can be invoked from a REPL (see the comment block at the end of the file).
(defn- inner-main [args]
  (let [{:keys [mode tabular user-metadata output-file]} (parse-cli-options args)
        ;; Evaluated in the same order as the original map literal.
        tabular-source (some-> tabular parse-source)
        metadata-source (some-> user-metadata parse-source)
        ;; Fall back to Turtle when no output file was given.
        rdf-format (or (some-> output-file formats/->rdf-format) RDFFormat/TURTLE)
        write-opts {:tabular-source tabular-source
                    :metadata-source metadata-source
                    :rdf-format rdf-format
                    :mode mode}]
    (if-let [destination (some-> output-file io/file)]
      ;; with-open closes the file writer once write-output returns.
      (with-open [file-writer (io/writer destination)]
        (write-output file-writer write-opts))
      (write-output (io/writer *out*) write-opts))))

(defn- -main [& args]
(try
(let [options (parse-cli-options args)
{:keys [mode tabular user-metadata output-file]} options
opts {:tabular-source (some-> tabular parse-source)
:metadata-source (some-> user-metadata parse-source)
:rdf-format (or (some-> output-file formats/->rdf-format) RDFFormat/TURTLE)
:mode mode}
output-file (some-> output-file io/file)]
(if output-file
(with-open [w (io/writer output-file)]
(write-output w opts))
(write-output (io/writer *out*) opts))
(System/exit 0))
(inner-main args)
(System/exit 0)
(catch Throwable ex
(display-error ex)
(System/exit 1))))


(comment
;; REPL scratchpad for timing and profiling a conversion run. Forms inside
;; `comment` are never evaluated on namespace load; evaluate them one at a
;; time from an editor-connected REPL.

;; Wall-clock timing of a single run. inner-main is called rather than -main
;; because -main calls System/exit when it finishes.
(time (inner-main ["-t" "out/hmrc-rts-small-area.csv" "-u" "out/hmrc-rts-small-area.csv-metadata.json" "-m" "annotated" "-o" "cube.nt"]))

;; clj-async-profiler must be on the classpath for this require to succeed.
;; NOTE(review): it appears to be declared under the :dev alias in deps.edn - confirm.
(require '[clj-async-profiler.core :as prof])

;; Profile the following expression:
(prof/profile (inner-main ["-t" "out/hmrc-rts-small-area.csv" "-u" "out/hmrc-rts-small-area.csv-metadata.json" "-m" "annotated" "-o" "cube.nt"]))

;; The resulting flamegraph will be stored in /tmp/clj-async-profiler/results/
;; You can view the SVG directly from there or start a local webserver:

(prof/serve-files 8080) ; Serve on port 8080

)
15 changes: 9 additions & 6 deletions src/csv2rdf/tabular/cell.clj
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,15 @@
(defn ^{:tabular-spec "6.4.9"} validate-length
"Validates the length of the cell value is valid for the constraints on the column metadata"
[{:keys [value datatype] :as cell-element}]
(if-let [len (xml-datatype/get-length value datatype)]
(let [len-errors (->> length-relations
(map (fn [[k sym]] (get-length-error cell-element sym len (get datatype k))))
(remove nil?))]
(add-cell-errors cell-element len-errors))
cell-element))
(let [{:keys [length minLength maxLength]} datatype]
(if (or length minLength maxLength)
(if-let [len (xml-datatype/get-length value datatype)]
(let [len-errors (->> length-relations
(map (fn [[k sym]] (get-length-error cell-element sym len (get datatype k))))
(remove nil?))]
(add-cell-errors cell-element len-errors))
cell-element)
cell-element)))

(defn ^{:tabular-spec "6.4.9"} validate-value-bounds
"Validates the cell value is valid for any bounds specified on its datatype"
Expand Down
48 changes: 24 additions & 24 deletions src/csv2rdf/tabular/csv/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -286,30 +286,30 @@
to the dialect trim mode."
[^String row-content source-row-number {:keys [^Character escapeChar ^Character quoteChar ^Character delimiter trim-mode] :as options}]
(if (zero? (.length row-content))
[]
(let [;;NOTE: cells are parsed slightly differently depending on
;;whether the quote and escape characters are the same or
;;different. Each parser parses the next cell contents
;;from the specified starting position within the string
;;and returns a map containing the parsed cell contents
;;and the index within the string to continue
;;parsing. This should always point to the delimiter
;;character or be one passed the end of the string.
cell-parser (if (= escapeChar quoteChar)
parse-cell-double-quote
parse-cell-escape)]
(loop [idx 0
cells []]
(let [{:keys [cell ^long next-index]} (cell-parser row-content source-row-number idx options)]
;;if there is any remaining input, next-index should refer to the delimiter
;;consume it and move to the start of the next cell
;;otherwise entire row has been parsed
;;TODO: move quoted cell followed by delimiter validation stuff here?
(if (< next-index (.length row-content))
(let [next-char (.charAt row-content next-index)]
#_(assert (= delimiter next-char "Expected delimiter after parsed cell"))
(recur (inc next-index) (conj cells (trim-cell cell trim-mode))))
(conj cells (trim-cell cell trim-mode))))))))
[]
(let [ ;;NOTE: cells are parsed slightly differently depending on
;;whether the quote and escape characters are the same or
;;different. Each parser parses the next cell contents
;;from the specified starting position within the string
;;and returns a map containing the parsed cell contents
;;and the index within the string to continue
;;parsing. This should always point to the delimiter
;;character or be one past the end of the string.
cell-parser (if (= escapeChar quoteChar)
parse-cell-double-quote
parse-cell-escape)]
(persistent! (loop [idx 0
cells (transient [])]
(let [{:keys [cell ^long next-index]} (cell-parser row-content source-row-number idx options)]
;;if there is any remaining input, next-index should refer to the delimiter
;;consume it and move to the start of the next cell
;;otherwise entire row has been parsed
;;TODO: move quoted cell followed by delimiter validation stuff here?
(if (< next-index (.length row-content))
(let [next-char (.charAt row-content next-index)]
#_(assert (= delimiter next-char "Expected delimiter after parsed cell"))
(recur (inc next-index) (conj! cells (trim-cell cell trim-mode))))
(conj! cells (trim-cell cell trim-mode)))))))))

(s/def ::source-row-number (s/and integer? pos?))
(s/def ::cells (s/coll-of string? :kind vector? :into []))
Expand Down
8 changes: 4 additions & 4 deletions src/csv2rdf/xml/datatype.clj
Original file line number Diff line number Diff line change
Expand Up @@ -113,11 +113,11 @@
(if-let [root (find-root type-name)]
(node-subtypes root)))

(defn is-subtype?
(def is-subtype?
"Returns whether type b is a subtype of type a"
[a b]
(let [type-name (resolve-type-name b)]
(boolean (some #(= type-name %) (subtypes a)))))
(memoize (fn [a b]
(let [type-name (resolve-type-name b)]
(boolean (some #(= type-name %) (subtypes a)))))))

(defn is-binary-type? [type-name]
(contains? #{"hexBinary" "base64Binary"} (resolve-type-name type-name)))
Expand Down

0 comments on commit c79bcb1

Please sign in to comment.