Performance fixes for #659 (#684)
* refactor: construct children collection once

* lint-fix: specify return type of isRoot directly

* refactor: further performance & style improvements

* refactor: helpless panic

* refactor: another iteration of urgh

* Revert "refactor: another iteration of urgh"

This reverts commit e0bf428.

* refactor: swapping tries for gods sake

* Revert "refactor: swapping tries for gods sake"

This reverts commit b77cebe.

* Reapply "refactor: another iteration of urgh"

This reverts commit 63c19a2.

* refactor(wip): interim mental breakdown

* refactor: at least... happy tests?

* refactor: lock jsonlite options

* refactor(cfg-test): remove fake test

* refactor: try manual conversion, is worse

* Revert "refactor: try manual conversion, is worse"

This reverts commit 578c64e.

* refactor: simplify json conversion

* refactor: several minor performance tunes :3

* refactor: ensure the executor dies directly (and gracefully)

* refactor: dynamic `flowr_get` compile

* refactor: fine-tune cmp retrieval

* refactor: reduce size of json produce, defer obj map to flowR

* doc(test): remove dead jsdoc comment

* refactor: clean up tmp logfiles

* lint-fix: handle minor linter problems

* refactor: wordly clean-up

* feat: jsonlite begone!

* refactor: further improve print-out-performance

* refactor: always compile seems to be faster!

* refactor: use build for benchmark run

* refactor: switch back to `sort`

* feat(tests): cover for optional xmlparsedata availability

---------

Co-authored-by: Florian Sihler <florian.sihler@uni-ulm.de>
Ellpeck and EagleoutIce authored Feb 24, 2024
1 parent 96bbd86 commit bfd8f12
Showing 51 changed files with 255 additions and 257 deletions.
5 changes: 5 additions & 0 deletions .github/actions/setup/action.yaml
@@ -29,3 +29,8 @@ runs:
 uses: r-lib/actions/setup-r@v2
 with:
 r-version: ${{ inputs.r-version }}
+
+- name: Install R packages
+  if: ${{ inputs.r-version != '' }}
+  shell: Rscript {0}
+  run: install.packages("xmlparsedata", repos="https://cloud.r-project.org/")
2 changes: 1 addition & 1 deletion package.json
@@ -19,7 +19,7 @@
 "slicer": "ts-node src/cli/slicer-app.ts",
 "release": "release-it --ci",
 "benchmark-helper": "ts-node src/cli/benchmark-helper-app.ts",
-"benchmark": "ts-node src/cli/benchmark-app.ts",
+"benchmark": "npm run build && node dist/src/cli/benchmark-app.js",
 "summarizer": "ts-node src/cli/summarizer-app.ts",
 "export-quads": "ts-node src/cli/export-quads-app.ts",
 "build": "tsc --project .",
1 change: 0 additions & 1 deletion src/benchmark/slicer.ts
@@ -117,7 +117,6 @@ export class BenchmarkSlicer {
 this.loadedXml = await this.measureCommonStep('parse', 'retrieve AST from R code')
 this.normalizedAst = await this.measureCommonStep('normalize', 'normalize R AST')
 this.dataflow = await this.measureCommonStep('dataflow', 'produce dataflow information')
-this.ai = await this.measureCommonStep('ai', 'run abstract interpretation')

 this.stepper.switchToSliceStage()
5 changes: 2 additions & 3 deletions src/benchmark/stats/stats.ts
@@ -1,8 +1,7 @@
-import type { SingleSlicingCriterion, SlicingCriteria } from '../../slicing'
+import type { ReconstructionResult, SingleSlicingCriterion, SlicingCriteria } from '../../slicing'
 import type { NodeId, RParseRequestFromFile, RParseRequestFromText } from '../../r-bridge'
-import type { ReconstructionResult } from '../../slicing'

-export const CommonSlicerMeasurements = ['initialize R session', 'retrieve AST from R code', 'normalize R AST', 'produce dataflow information', 'run abstract interpretation', 'close R session', 'total'] as const
+export const CommonSlicerMeasurements = ['initialize R session', 'retrieve AST from R code', 'normalize R AST', 'produce dataflow information', 'close R session', 'total'] as const
 export type CommonSlicerMeasurements = typeof CommonSlicerMeasurements[number]

 export const PerSliceMeasurements = ['static slicing', 'reconstruct code', 'total'] as const
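The measurement lists above use the `as const` array plus `typeof ...[number]` idiom, which keeps the runtime list and the union type in a single source of truth. A minimal sketch of the pattern (the names `Measurements`/`Measurement` are hypothetical stand-ins, not identifiers from this repository):

```typescript
// Single source of truth: the runtime array drives the union type.
const Measurements = ['parse', 'normalize', 'dataflow', 'total'] as const
// 'parse' | 'normalize' | 'dataflow' | 'total'
type Measurement = typeof Measurements[number]

// The union catches typos at compile time, while the array
// is still available for runtime iteration and validation.
function isMeasurement(s: string): s is Measurement {
	return (Measurements as readonly string[]).includes(s)
}

console.log(isMeasurement('dataflow')) // true
console.log(isMeasurement('ai'))       // false
```

Removing an entry from the array (as this commit does with the abstract-interpretation step) automatically narrows the type, so stale uses fail to compile.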
2 changes: 1 addition & 1 deletion src/cli/benchmark-helper-app.ts
@@ -42,7 +42,7 @@ async function benchmark() {
 const fileStat = fs.statSync(options.input)
 guard(fileStat.isFile(), `File ${options.input} does not exist or is no file`)

-const request = { request: 'file', content: options.input } as RParseRequestFromFile
+const request: RParseRequestFromFile = { request: 'file', content: options.input }

 const slicer = new BenchmarkSlicer()
 try {
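The switch from an `as` assertion to a type annotation is more than style: an assertion silences excess- and missing-property checking, while an annotation fully checks the object literal. A small sketch with a hypothetical local type standing in for `RParseRequestFromFile`:

```typescript
// Hypothetical stand-in for the RParseRequestFromFile shape used above.
interface FileRequest {
	request: 'file'
	content: string
}

// With a type annotation, the object literal is fully checked:
// a typo such as `contnet`, or a missing field, is a compile error.
const checked: FileRequest = { request: 'file', content: 'test.R' }

// With `as`, the compiler trusts the assertion; e.g.
//   const unchecked = { request: 'file' } as FileRequest
// compiles despite the missing `content` field.

console.log(checked.content) // test.R
```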
9 changes: 3 additions & 6 deletions src/cli/repl/commands/parse.ts
@@ -1,9 +1,6 @@
 import type { XmlBasedJson} from '../../../r-bridge'
 import {childrenKey} from '../../../r-bridge'
 import {attributesKey, contentKey} from '../../../r-bridge'
-import {
-parseCSV
-} from '../../../r-bridge'
 import {getKeysGuarded, RawRType, requestFromInput} from '../../../r-bridge'
 import {
 extractLocation,
@@ -14,8 +11,8 @@ import type { OutputFormatter } from '../../../statistics'
 import { FontStyles } from '../../../statistics'
 import type { ReplCommand } from './main'
 import { SteppingSlicer } from '../../../core'
-import {csvToRecord} from '../../../r-bridge/lang-4.x/ast/parser/csv/format'
-import {convertToXmlBasedJson} from '../../../r-bridge/lang-4.x/ast/parser/csv/parser'
+import {prepareParsedData} from '../../../r-bridge/lang-4.x/ast/parser/json/format'
+import {convertPreparedParsedData} from '../../../r-bridge/lang-4.x/ast/parser/json/parser'

 type DepthList = { depth: number, node: XmlBasedJson, leaf: boolean }[]

@@ -132,7 +129,7 @@ export const parseCommand: ReplCommand = {
 request: requestFromInput(remainingLine.trim())
 }).allRemainingSteps()

-const object = convertToXmlBasedJson(csvToRecord(parseCSV(result.parse)))
+const object = convertPreparedParsedData(prepareParsedData(result.parse))

 output.stdout(depthListToTextTree(toDepthMap(object), output.formatter))
 }
3 changes: 1 addition & 2 deletions src/cli/repl/server/connection.ts
@@ -140,8 +140,7 @@ export class FlowRServerConnection {
 results: {
 parse: await printStepResult('parse', results.parse as string, StepOutputFormat.RdfQuads, config()),
 normalize: await printStepResult('normalize', results.normalize as NormalizedAst, StepOutputFormat.RdfQuads, config()),
-dataflow: await printStepResult('dataflow', results.dataflow as DataflowInformation, StepOutputFormat.RdfQuads, config()),
-ai: ''
+dataflow: await printStepResult('dataflow', results.dataflow as DataflowInformation, StepOutputFormat.RdfQuads, config())
 }
 })
 } else {
2 changes: 1 addition & 1 deletion src/core/input.ts
@@ -29,7 +29,7 @@ interface BaseSteppingSlicerInput<InterestedIn extends StepName | undefined> ext
 autoSelectIf?: AutoSelectPredicate
 }

-interface NormalizeSteppingSlicerInput<InterestedIn extends 'ai' | 'dataflow' | 'normalize'> extends BaseSteppingSlicerInput<InterestedIn> {
+interface NormalizeSteppingSlicerInput<InterestedIn extends 'dataflow' | 'normalize'> extends BaseSteppingSlicerInput<InterestedIn> {
 stepOfInterest: InterestedIn
 }
3 changes: 1 addition & 2 deletions src/core/output.ts
@@ -14,7 +14,6 @@ type StepResultsHelper<InterestedIn extends StepName> = {
 'parse': Out<'parse'>
 'normalize': StepResultsHelper<'parse'> & Out<'normalize'>
 'dataflow': StepResultsHelper<'normalize'> & Out<'dataflow'>
-'ai': StepResultsHelper<'dataflow'> & Out<'ai'>
-'slice': StepResultsHelper<'ai'> & Out<'slice'>
+'slice': StepResultsHelper<'dataflow'> & Out<'slice'>
 'reconstruct': StepResultsHelper<'slice'> & Out<'reconstruct'>
 }[InterestedIn]
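`StepResultsHelper` accumulates the outputs of all earlier pipeline stages and then selects one entry via an indexed access on the step name. A simplified sketch of that lookup-type pattern, with hypothetical step outputs (not the actual `Out<...>` types of this repository):

```typescript
// Hypothetical per-step outputs, mirroring the role of Out<Step> above.
interface Outputs {
	parse: { ast: string }
	dataflow: { graph: string[] }
	slice: { ids: number[] }
}

// Each later stage's result type intersects everything produced so far;
// the indexed access `[S]` picks the entry for the requested step.
type ResultsUpTo<S extends keyof Outputs> = {
	parse: Outputs['parse']
	dataflow: Outputs['parse'] & Outputs['dataflow']
	slice: Outputs['parse'] & Outputs['dataflow'] & Outputs['slice']
}[S]

// Asking for 'dataflow' results requires the parse output too:
const afterDataflow: ResultsUpTo<'dataflow'> = {
	ast: '(exprlist)',
	graph: ['a -> b']
}
console.log(afterDataflow.graph.length)
```

Dropping a stage from such a chain (as this commit drops `'ai'`) only requires rewiring the one entry that referenced it.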
7 changes: 3 additions & 4 deletions src/core/print/parse-printer.ts
@@ -2,9 +2,8 @@ import type { QuadSerializationConfiguration} from '../../util/quads'
 import { serialize2quads } from '../../util/quads'
 import type { XmlBasedJson} from '../../r-bridge'
 import {attributesKey, childrenKey, contentKey} from '../../r-bridge'
-import {parseCSV} from '../../r-bridge'
-import {csvToRecord} from '../../r-bridge/lang-4.x/ast/parser/csv/format'
-import {convertToXmlBasedJson} from '../../r-bridge/lang-4.x/ast/parser/csv/parser'
+import {prepareParsedData} from '../../r-bridge/lang-4.x/ast/parser/json/format'
+import {convertPreparedParsedData} from '../../r-bridge/lang-4.x/ast/parser/json/parser'

 function filterObject(obj: XmlBasedJson, keys: Set<string>): XmlBasedJson[] | XmlBasedJson {
 if(typeof obj !== 'object') {
@@ -28,7 +27,7 @@ function filterObject(obj: XmlBasedJson, keys: Set<string>): XmlBasedJson[] | Xm
 }

 export function parseToQuads(code: string, config: QuadSerializationConfiguration): string{
-const obj = convertToXmlBasedJson(csvToRecord(parseCSV(code)))
+const obj = convertPreparedParsedData(prepareParsedData(code))
 // recursively filter so that if the object contains one of the keys 'a', 'b' or 'c', all other keys are ignored
 return serialize2quads(
 filterObject(obj, new Set([attributesKey, childrenKey, contentKey])) as XmlBasedJson,
8 changes: 2 additions & 6 deletions src/core/slicer.ts
@@ -211,15 +211,11 @@ export class SteppingSlicer<InterestedIn extends StepName | undefined = typeof L
 result = executeSingleSubStep(step, this.request, this.results.normalize as NormalizedAst)
 break
 case 3:
-step = guardStep('ai')
-result = executeSingleSubStep(step, this.results.normalize as NormalizedAst, this.results.dataflow as DataflowInformation)
-break
-case 4:
 guard(this.criterion !== undefined, 'Cannot decode criteria without a criterion')
 step = guardStep('slice')
-result = executeSingleSubStep(step, (this.results.ai as DataflowInformation).graph, this.results.normalize as NormalizedAst, this.criterion)
+result = executeSingleSubStep(step, (this.results.dataflow as DataflowInformation).graph, this.results.normalize as NormalizedAst, this.criterion)
 break
-case 5:
+case 4:
 step = guardStep('reconstruct')
 result = executeSingleSubStep(step, this.results.normalize as NormalizedAst<NoInfo>, (this.results.slice as SliceResult).result)
 break
22 changes: 6 additions & 16 deletions src/core/steps.ts
@@ -14,7 +14,7 @@
 */

 import type { MergeableRecord } from '../util/objects'
-import { retrieveCsvFromRCode } from '../r-bridge'
+import { retrieveParseDataFromRCode } from '../r-bridge'
 import { produceDataFlowGraph } from '../dataflow'
 import { reconstructToCode, staticSlicing } from '../slicing'
 import type { IStepPrinter} from './print/print'
@@ -33,9 +33,7 @@ import {
 dataflowGraphToMermaidUrl,
 dataflowGraphToQuads
 } from './print/dataflow-printer'
-import type {DataflowInformation} from '../dataflow/internal/info'
-import type {runAbstractInterpretation} from '../abstract-interpretation/processor'
-import {normalize} from '../r-bridge/lang-4.x/ast/parser/csv/parser'
+import {normalize} from '../r-bridge/lang-4.x/ast/parser/json/parser'

 /**
 * This represents close a function that we know completely nothing about.
@@ -72,14 +70,14 @@ export interface IStep<
 export const STEPS_PER_FILE = {
 'parse': {
 description: 'Parse the given R code into an AST',
-processor: retrieveCsvFromRCode,
+processor: retrieveParseDataFromRCode,
 required: 'once-per-file',
 printer: {
 [StepOutputFormat.Internal]: internalPrinter,
 [StepOutputFormat.Json]: text => text,
 [StepOutputFormat.RdfQuads]: parseToQuads
 }
-} satisfies IStep<typeof retrieveCsvFromRCode>,
+} satisfies IStep<typeof retrieveParseDataFromRCode>,
 'normalize': {
 description: 'Normalize the AST to flowR\'s AST (first step of the normalization)',
 processor: normalize,
@@ -103,15 +101,7 @@
 [StepOutputFormat.Mermaid]: dataflowGraphToMermaid,
 [StepOutputFormat.MermaidUrl]: dataflowGraphToMermaidUrl
 }
-} satisfies IStep<typeof produceDataFlowGraph>,
-'ai': {
-description: 'Run abstract interpretation',
-processor: (_, dfInfo: DataflowInformation) => dfInfo, // Use runAbstractInterpretation here when it's ready
-required: 'once-per-file',
-printer: {
-[StepOutputFormat.Internal]: internalPrinter
-}
-} satisfies IStep<typeof runAbstractInterpretation>
+} satisfies IStep<typeof produceDataFlowGraph>
 } as const

 export const STEPS_PER_SLICE = {
@@ -134,7 +124,7 @@
 } as const

 export const STEPS = { ...STEPS_PER_FILE, ...STEPS_PER_SLICE } as const
-export const LAST_PER_FILE_STEP = 'ai' as const
+export const LAST_PER_FILE_STEP = 'dataflow' as const
 export const LAST_STEP = 'reconstruct' as const

 export type StepName = keyof typeof STEPS
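The step table above validates each entry with `satisfies IStep<...>`, which checks the literal against the interface without widening its inferred type the way a plain annotation would. A minimal sketch of the operator (TypeScript ≥ 4.9; the `Step` shape here is a hypothetical stand-in, not the repository's `IStep`):

```typescript
// Minimal stand-in for the IStep shape used in the diff above.
interface Step {
	description: string
	required: 'once-per-file' | 'once-per-slice'
}

// `satisfies` checks the literal against Step but keeps the narrow
// inferred type, so `parseStep.required` stays the literal type
// 'once-per-file' rather than widening to the full union.
const parseStep = {
	description: 'Parse the given R code into an AST',
	required: 'once-per-file'
} satisfies Step

const requirement: 'once-per-file' = parseStep.required // still narrow
console.log(requirement)
```

With an annotation `const parseStep: Step = ...`, the last assignment would fail, since `required` would widen to the union.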
2 changes: 1 addition & 1 deletion src/r-bridge/lang-4.x/ast/index.ts
@@ -1,3 +1,3 @@
 export * from './model'
 export * from './parser/xml'
-export {parseLog} from './parser/csv/parser'
+export {parseLog} from './parser/json/parser'
36 changes: 0 additions & 36 deletions src/r-bridge/lang-4.x/ast/parser/csv/format.ts

This file was deleted.

41 changes: 41 additions & 0 deletions src/r-bridge/lang-4.x/ast/parser/json/format.ts
@@ -0,0 +1,41 @@
+import { removeTokenMapQuotationMarks } from '../../../../retriever'
+import { guard } from '../../../../../util/assert'
+
+export const RootId = 0
+
+export interface Entry extends Record<string, unknown> {
+line1: number,
+col1: number,
+line2: number,
+col2: number,
+id: number,
+parent: number,
+token: string,
+terminal: boolean,
+text: string,
+children?: Entry[]
+}
+
+type ParsedDataRow = [line1: number, col1: number, line2: number, col2: number, id: number, parent: number, token: string, terminal: boolean, text: string]
+
+export function prepareParsedData(data: string): Map<number, Entry> {
+const json: unknown = JSON.parse(data)
+guard(Array.isArray(json), () => `Expected ${data} to be an array but was not`)
+
+const ret = new Map<number, Entry>((json as ParsedDataRow[]).map(([line1, col1, line2, col2, id, parent, token, terminal, text]) => {
+return [id, { line1, col1, line2, col2, id, parent, token: removeTokenMapQuotationMarks(token), terminal, text }] satisfies [number, Entry]
+}))
+
+// iterate a second time to set parent-child relations (since they may be out of order in the csv)
+for(const entry of ret.values()) {
+if(entry.parent != RootId) {
+const parent = ret.get(entry.parent)
+if(parent) {
+parent.children ??= []
+parent.children.push(entry)
+}
+}
+}
+
+return ret
+}
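The new format module links a flat set of id/parent rows into a tree in two passes: first index every row by id, then attach each entry to its parent. A simplified sketch of the same approach, with the positional fields trimmed down to just `id` and `parent` (hypothetical names, not the repository's types):

```typescript
interface Node {
	id: number
	parent: number
	children?: Node[]
}

const RootId = 0

// Two-pass linking, as in prepareParsedData above: build the id map
// first, then attach children. Input row order does not matter,
// because every node already exists before linking starts.
function linkRows(rows: [id: number, parent: number][]): Map<number, Node> {
	const byId = new Map<number, Node>(rows.map(([id, parent]) => [id, { id, parent }]))
	for(const node of byId.values()) {
		if(node.parent !== RootId) {
			const parent = byId.get(node.parent)
			if(parent) {
				parent.children ??= []
				parent.children.push(node)
			}
		}
	}
	return byId
}

// A child listed before its parent still links correctly:
const tree = linkRows([[3, 1], [1, 0], [2, 1]])
console.log(tree.get(1)?.children?.length) // 2
```

Compared with re-scanning the whole collection for each node's children, this keeps the construction linear in the number of rows, which matches the performance aim of the commit.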
@@ -7,34 +7,36 @@ import {DEFAULT_PARSER_HOOKS, type ParserData} from '../xml'
 import type {IdGenerator, NoInfo} from '../../model'
 import {decorateAst, deterministicCountingIdGenerator, type NormalizedAst} from '../../model'
 import {deepMergeObject} from '../../../../../util/objects'
-import type {CsvEntry} from './format'
-import {csvToRecord, getChildren, type ParsedCsv} from './format'
-import {parseCSV} from '../../../values'
+import type { Entry} from './format'
+import { RootId, prepareParsedData } from './format'
 import {parseRootObjToAst} from '../xml/internal'
 import {log} from '../../../../../util/log'

 export const parseLog = log.getSubLogger({name: 'ast-parser'})

-export function normalize(csvString: string, hooks?: DeepPartial<XmlParserHooks>, getId: IdGenerator<NoInfo> = deterministicCountingIdGenerator(0)): NormalizedAst {
+export function normalize(jsonString: string, hooks?: DeepPartial<XmlParserHooks>, getId: IdGenerator<NoInfo> = deterministicCountingIdGenerator(0)): NormalizedAst {
 const hooksWithDefaults = deepMergeObject(DEFAULT_PARSER_HOOKS, hooks) as XmlParserHooks

 const data: ParserData = { hooks: hooksWithDefaults, currentRange: undefined, currentLexeme: undefined }
-const object = convertToXmlBasedJson(csvToRecord(parseCSV(csvString)))
+const object = convertPreparedParsedData(prepareParsedData(jsonString))

 return decorateAst(parseRootObjToAst(data, object), getId)
 }

-export function convertToXmlBasedJson(csv: ParsedCsv): XmlBasedJson{
+export function convertPreparedParsedData(valueMapping: Map<number, Entry>): XmlBasedJson {
 const exprlist: XmlBasedJson = {}
 exprlist[nameKey] = 'exprlist'
-exprlist[childrenKey] = Object.values(csv)
-// we convert all roots, which are entries with parent 0
-.filter(v => v.parent == 0)
-.map(v => convertEntry(v, csv))
+const children = []
+for(const entry of valueMapping.values()) {
+if(entry.parent == RootId) {
+children.push(convertEntry(entry))
+}
+}
+exprlist[childrenKey] = children
 return {'exprlist': exprlist}
 }

-function convertEntry(csvEntry: CsvEntry, csv: ParsedCsv): XmlBasedJson {
+function convertEntry(csvEntry: Entry): XmlBasedJson {
 const xmlEntry: XmlBasedJson = {}

 xmlEntry[attributesKey] = {
@@ -49,19 +51,18 @@ function convertEntry(csvEntry: CsvEntry, csv: ParsedCsv): XmlBasedJson {
 }

 // check and recursively iterate children
-const children = getChildren(csv, csvEntry)
-if(children && children.length > 0){
-xmlEntry[childrenKey] = children
+if(csvEntry.children && csvEntry.children.length > 0){
+xmlEntry[childrenKey] = csvEntry.children
 // we sort children the same way xmlparsedata does (by line, by column, by inverse end line, by inverse end column, by terminal state, by combined "start" tiebreaker value)
 // (https://github.com/r-lib/xmlparsedata/blob/main/R/package.R#L153C72-L153C78)
 .sort((c1,c2) => c1.line1-c2.line1 || c1.col1-c2.col1 || c2.line2-c1.line2 || c2.col2-c1.col2 || Number(c1.terminal)-Number(c2.terminal) || sortTiebreak(c1)-sortTiebreak(c2))
-.map(c => convertEntry(c, csv))
+.map(convertEntry)
 }

 return xmlEntry
 }

-function sortTiebreak(entry: CsvEntry){
+function sortTiebreak(entry: Entry) {
 // see https://github.com/r-lib/xmlparsedata/blob/main/R/package.R#L110C5-L110C11
 return entry.line1 * (Math.max(entry.col1, entry.col2) + 1) + entry.col1
 }
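The child ordering above mimics xmlparsedata's sort by chaining `||` between numeric comparisons: each clause only decides the order when every earlier clause evaluated to 0. A small sketch of that comparator on hypothetical sample spans (fields reduced to the ones the sort touches):

```typescript
interface Span {
	line1: number
	col1: number
	line2: number
	col2: number
	terminal: boolean
}

// Chained `||` comparator, as in the diff above: a clause is consulted
// only when all earlier clauses compare equal (i.e. return 0).
function compare(c1: Span, c2: Span): number {
	return c1.line1 - c2.line1
		|| c1.col1 - c2.col1
		|| c2.line2 - c1.line2   // inverse end line: wider spans first
		|| c2.col2 - c1.col2     // inverse end column
		|| Number(c1.terminal) - Number(c2.terminal)
}

// An enclosing expression (cols 1-10) sorts before the leaf it contains:
const outer: Span = { line1: 1, col1: 1, line2: 1, col2: 10, terminal: false }
const inner: Span = { line1: 1, col1: 1, line2: 1, col2: 3, terminal: true }
console.log([inner, outer].sort(compare)[0] === outer) // true
```

The inverted end-position clauses are what put enclosing nodes ahead of their contents when both start at the same position.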

4 comments on commit bfd8f12

@github-actions

"artificial" Benchmark Suite

| Benchmark suite | Current: bfd8f12 | Previous: d69018e | Ratio |
|---|---|---|---|
| Total per-file | 1516.6973344545454 ms (3765.211119967862) | 1511.494083 ms (3708.3225553463017) | 1.00 |
| Retrieve AST from R code | 307.00354272727276 ms (261.0172798892817) | 64.46248863636363 ms (125.66414120100016) | 4.76 |
| Normalize R AST | 33.07620031818182 ms (64.09281603038194) | 94.99519236363636 ms (152.9376581920758) | 0.35 |
| Produce dataflow information | 70.26002963636364 ms (175.12875389336065) | 65.2556795909091 ms (167.18441854609554) | 1.08 |
| Total per-slice | 1.8461127814688914 ms (1.337637708289551) | 1.8724288794806876 ms (1.3873679811565907) | 0.99 |
| Static slicing | 1.3818304625094695 ms (1.2825199074357059) | 1.4074784311593942 ms (1.3118563756339259) | 0.98 |
| Reconstruct code | 0.45033915063385405 ms (0.19568096194174467) | 0.4524929302663976 ms (0.22636683004337768) | 1.00 |
| failed to reconstruct/re-parse | 0 # | 0 # | 1 |
| times hit threshold | 0 # | 0 # | 1 |
| reduction (characters) | 0.7329390759026896 # | 0.7329390759026896 # | 1 |
| reduction (normalized tokens) | 0.7209834969577295 # | 0.720988345209971 # | 1.00 |

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark '"artificial" Benchmark Suite'.
Benchmark result of this commit is worse than the previous benchmark result exceeding threshold 2.

| Benchmark suite | Current: bfd8f12 | Previous: d69018e | Ratio |
|---|---|---|---|
| Retrieve AST from R code | 307.00354272727276 ms (261.0172798892817) | 64.46248863636363 ms (125.66414120100016) | 4.76 |

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions

"social-science" Benchmark Suite

| Benchmark suite | Current: bfd8f12 | Previous: d69018e | Ratio |
|---|---|---|---|
| Total per-file | 3707.1666103000002 ms (6215.042229911342) | 3566.86774236 ms (5920.286185213901) | 1.04 |
| Retrieve AST from R code | 314.40706378 ms (119.16372206117796) | 72.22227936 ms (60.97026629229811) | 4.35 |
| Normalize R AST | 34.71123942 ms (30.592799884980952) | 113.02594858 ms (70.71306906384982) | 0.31 |
| Produce dataflow information | 184.95076519999998 ms (282.02234968397937) | 163.44175874 ms (276.9623037407309) | 1.13 |
| Total per-slice | 9.105993052874766 ms (14.934544115063787) | 8.599255365044066 ms (14.312877376595168) | 1.06 |
| Static slicing | 8.59434534848267 ms (14.811448973463916) | 8.071953766135923 ms (14.188089279803133) | 1.06 |
| Reconstruct code | 0.5026473785153759 ms (0.23981576556699372) | 0.5187709959800451 ms (0.27627204677573897) | 0.97 |
| failed to reconstruct/re-parse | 9 # | 9 # | 1 |
| times hit threshold | 967 # | 967 # | 1 |
| reduction (characters) | 0.8935817303062393 # | 0.898713819973478 # | 0.99 |
| reduction (normalized tokens) | 0.8531248144961374 # | 0.8579790415512589 # | 0.99 |

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark '"social-science" Benchmark Suite'.
Benchmark result of this commit is worse than the previous benchmark result exceeding threshold 2.

| Benchmark suite | Current: bfd8f12 | Previous: d69018e | Ratio |
|---|---|---|---|
| Retrieve AST from R code | 314.40706378 ms (119.16372206117796) | 72.22227936 ms (60.97026629229811) | 4.35 |

This comment was automatically generated by workflow using github-action-benchmark.
