Skip to content

Commit

Permalink
Feat: Follow up on source calls (#609)
Browse files Browse the repository at this point in the history
* wip: added testfile & started on environment (??)

* wip: some todos

* wip: start of source argument resolve

* wip: added testfile & started on environment (??)

* wip: some todos

* wip: start of source argument resolve

* refactor: start using sync r shell implementation

* wip: rebase

* refactor: start using sync r shell implementation

* wip: use executeSingleSubStep for parsing sourced code

* wip: added testfile & started on environment (??)

* wip: some todos

* wip: start of source argument resolve

* wip: rebase

* refactor: start using sync r shell implementation

* wip: some todos

* wip: use executeSingleSubStep for parsing sourced code

* wip: fix merge issues

* feat-fix: avoid cyclic dependency when using step executor

* wip: run normalize and dataflow on sourced file

* wip: some work on source dataflowing

* refactor: remove print

* refactor: clean up todos and move source to its own function

* refactor: explicitly as in processSourceCall

* refactor: damn u typescript

* feat-fix: ensure we only parse built-in source calls

* refactor: remove todo

* feat: allow overriding the source file provider

* test: start on source tests

* refactor: overhaul source providers

* refactor: generify source providers to RParseRequestProvider

* test: added test for conditional source

* refactor: properly handle missing/invalid sourced files

* wip: test for recursive sources

* feat: skip dataflow analysis for re-sourced references

* wip: add another todo

* refactor: use parse requests in dataflow processor info

* refactor: first pass of reference chain impl

* feat-fix: also catch normalize and dataflow errors

* test: finished recursive source test

* test: added test for non-constant source argument

* test: added multi-source test

* feat-fix: sourcing multiple files works correctly now

* refactor: resolve review comments

* test: reset the source provider to the default value after each describe

* test-fix: reset the source provider in the source describe instead

---------

Co-authored-by: Florian Sihler <florian.sihler@uni-ulm.de>
  • Loading branch information
Ellpeck and EagleoutIce authored Feb 7, 2024
1 parent 2a72e25 commit ec104b8
Show file tree
Hide file tree
Showing 15 changed files with 467 additions and 46 deletions.
2 changes: 1 addition & 1 deletion src/cli/repl/commands/parse.ts
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ export const parseCommand: ReplCommand = {
}).allRemainingSteps()

const config = deepMergeObject<XmlParserConfig>(DEFAULT_XML_PARSER_CONFIG, { tokenMap: await shell.tokenMap() })
const object = await xlm2jsonObject(config, result.parse)
const object = xlm2jsonObject(config, result.parse)

output.stdout(depthListToTextTree(toDepthMap(object, config), config, output.formatter))
}
Expand Down
4 changes: 2 additions & 2 deletions src/core/print/parse-printer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ function filterObject(obj: XmlBasedJson, keys: Set<string>): XmlBasedJson[] | Xm

}

export async function parseToQuads(code: string, config: QuadSerializationConfiguration, parseConfig: XmlParserConfig): Promise<string> {
const obj = await xlm2jsonObject(parseConfig, code)
export function parseToQuads(code: string, config: QuadSerializationConfiguration, parseConfig: XmlParserConfig): string{
const obj = xlm2jsonObject(parseConfig, code)
// recursively filter so that if the object contains one of the keys 'a', 'b' or 'c', all other keys are ignored
return serialize2quads(
filterObject(obj, new Set([parseConfig.attributeName, parseConfig.childrenName, parseConfig.contentName])) as XmlBasedJson,
Expand Down
4 changes: 2 additions & 2 deletions src/core/slicer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -204,11 +204,11 @@ export class SteppingSlicer<InterestedIn extends StepName | undefined = typeof L
break
case 1:
step = guardStep('normalize')
result = await executeSingleSubStep(step, this.results.parse as string, await this.shell.tokenMap(), this.hooks, this.getId)
result = executeSingleSubStep(step, this.results.parse as string, await this.shell.tokenMap(), this.hooks, this.getId)
break
case 2:
step = guardStep('dataflow')
result = executeSingleSubStep(step, this.results.normalize as NormalizedAst)
result = executeSingleSubStep(step, this.request, this.results.normalize as NormalizedAst)
break
case 3:
step = guardStep('ai')
Expand Down
2 changes: 1 addition & 1 deletion src/core/steps.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ export const STEPS_PER_FILE = {
} satisfies IStep<typeof normalize>,
'dataflow': {
description: 'Construct the dataflow graph',
processor: produceDataFlowGraph,
processor: (r, a) => produceDataFlowGraph(r, a),
required: 'once-per-file',
printer: {
[StepOutputFormat.Internal]: internalPrinter,
Expand Down
8 changes: 8 additions & 0 deletions src/dataflow/environments/environment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,14 @@ export const DefaultEnvironmentMemory = new Map<Identifier, IdentifierDefinition
definedAt: BuiltIn,
name: 'print',
nodeId: BuiltIn
}]],
['source', [{
kind: 'built-in-function',
scope: GlobalScope,
used: 'always',
definedAt: BuiltIn,
name: 'source',
nodeId: BuiltIn
}]]
])

Expand Down
14 changes: 11 additions & 3 deletions src/dataflow/extractor.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import type { NormalizedAst, ParentInformation, RAssignmentOp, RBinaryOp} from '../r-bridge'
import type {NormalizedAst, ParentInformation, RAssignmentOp, RBinaryOp, RParseRequest} from '../r-bridge'
import { requestFingerprint} from '../r-bridge'
import { RType } from '../r-bridge'
import type { DataflowInformation } from './internal/info'
import type { DataflowProcessorInformation, DataflowProcessors} from './processor'
Expand Down Expand Up @@ -48,8 +49,15 @@ const processors: DataflowProcessors<any> = {
[RType.ExpressionList]: processExpressionList,
}

export function produceDataFlowGraph<OtherInfo>(ast: NormalizedAst<OtherInfo & ParentInformation>, initialScope: DataflowScopeName = LocalScope): DataflowInformation {
return processDataflowFor<OtherInfo>(ast.ast, { completeAst: ast, activeScope: initialScope, environments: initializeCleanEnvironments(), processors: processors as DataflowProcessors<OtherInfo & ParentInformation> })
/**
 * Produces the dataflow information for the given normalized AST.
 *
 * @param request      - the parse request the AST was produced from; its fingerprint seeds
 *                       the reference chain used to detect cyclic `source` calls
 * @param ast          - the normalized AST to analyze
 * @param initialScope - the scope the analysis starts in (defaults to the local scope)
 * @returns the dataflow information for the whole AST
 */
export function produceDataFlowGraph<OtherInfo>(request: RParseRequest, ast: NormalizedAst<OtherInfo & ParentInformation>, initialScope: DataflowScopeName = LocalScope): DataflowInformation {
	return processDataflowFor<OtherInfo>(ast.ast, {
		completeAst:    ast,
		activeScope:    initialScope,
		environments:   initializeCleanEnvironments(),
		processors:     processors as DataflowProcessors<OtherInfo & ParentInformation>,
		currentRequest: request,
		// the chain initially contains only this request; sourced files append to it
		referenceChain: [requestFingerprint(request)]
	})
}

export function processBinaryOp<OtherInfo>(node: RBinaryOp<OtherInfo & ParentInformation>, data: DataflowProcessorInformation<OtherInfo & ParentInformation>) {
Expand Down
22 changes: 14 additions & 8 deletions src/dataflow/internal/process/functions/function-call.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import type { DataflowInformation } from '../../info'
import type { DataflowProcessorInformation} from '../../../processor'
import type {DataflowProcessorInformation} from '../../../processor'
import { processDataflowFor } from '../../../processor'
import { define, overwriteEnvironments, resolveByName } from '../../../environments'
import type { ParentInformation, RFunctionCall} from '../../../../r-bridge'
import { RType } from '../../../../r-bridge'
import {define, overwriteEnvironments, resolveByName} from '../../../environments'
import type {ParentInformation, RFunctionCall} from '../../../../r-bridge'
import { RType} from '../../../../r-bridge'
import { guard } from '../../../../util/assert'
import type { FunctionArgument } from '../../../index'
import type {FunctionArgument} from '../../../index'
import { DataflowGraph, dataflowLogger, EdgeType } from '../../../index'
import { linkArgumentsOnCall } from '../../linker'
import { LocalScope } from '../../../environments/scopes'
import {isSourceCall, processSourceCall} from './source'

export const UnnamedFunctionCallPrefix = 'unnamed-function-call-'

Expand Down Expand Up @@ -40,7 +41,6 @@ export function processFunctionCall<OtherInfo>(functionCall: RFunctionCall<Other
finalGraph.mergeWith(functionName.graph)
}


for(const arg of functionCall.arguments) {
if(arg === undefined) {
callArgs.push('empty')
Expand Down Expand Up @@ -107,13 +107,19 @@ export function processFunctionCall<OtherInfo>(functionCall: RFunctionCall<Other
inIds.push(...functionName.in, ...functionName.unknownReferences)
}

return {
let info: DataflowInformation = {
unknownReferences: [],
in: inIds,
out: functionName.out, // we do not keep argument out as it has been linked by the function
graph: finalGraph,
environments: finalEnv,
scope: data.activeScope
}
}

// parse a source call and analyze the referenced code
if(isSourceCall(functionCallName, data.activeScope,finalEnv)) {
info = processSourceCall(functionCall, data, info)
}

return info
}
80 changes: 80 additions & 0 deletions src/dataflow/internal/process/functions/source.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import type {IdGenerator, NoInfo, RArgument, RParseRequest, RParseRequestProvider} from '../../../../r-bridge'
import { requestFingerprint} from '../../../../r-bridge'
import { sourcedDeterministicCountingIdGenerator} from '../../../../r-bridge'
import {requestProviderFromFile} from '../../../../r-bridge'
import {type NormalizedAst, type ParentInformation, removeTokenMapQuotationMarks, type RFunctionCall, RType} from '../../../../r-bridge'
import {RShellExecutor} from '../../../../r-bridge/shell-executor'
import {executeSingleSubStep} from '../../../../core'
import {type DataflowProcessorInformation, processDataflowFor} from '../../../processor'
import {type DataflowScopeName, type Identifier, overwriteEnvironments, type REnvironmentInformation, resolveByName} from '../../../environments'
import type {DataflowInformation} from '../../info'
import {dataflowLogger} from '../../../index'

// the provider used to turn the path argument of a `source` call into a parse request;
// mutable so that tests or embedders can substitute e.g. an in-memory provider
let sourceProvider = requestProviderFromFile()

/**
 * Overrides the {@link RParseRequestProvider} used to resolve the paths of `source` calls.
 */
export function setSourceProvider(provider: RParseRequestProvider): void {
	sourceProvider = provider
}

/**
 * Determines whether the function referenced by the given name resolves, in the given
 * scope and environment, to exactly the built-in `source` function.
 *
 * @param name         - the name of the called function
 * @param scope        - the scope to resolve the name in
 * @param environments - the environments to resolve the name against
 * @returns `true` only if the name resolves to a single definition and that definition
 *          is the built-in `source` function
 */
export function isSourceCall(name: Identifier, scope: DataflowScopeName, environments: REnvironmentInformation): boolean {
	const definitions = resolveByName(name, scope, environments)
	// fail if the name is unresolved or has multiple definitions, because then we must treat
	// the complete import as a maybe because it might do something different
	if(definitions === undefined || definitions.length !== 1) {
		return false
	}
	const def = definitions[0]
	return def.name === 'source' && def.kind === 'built-in-function'
}

/**
 * Processes a call to the built-in `source` function by parsing and analyzing the sourced
 * file and merging its dataflow information into the current analysis.
 *
 * Only calls whose first argument is a constant string are handled; for all other calls,
 * and for files already on the reference chain (cyclic sources), the given information is
 * returned unchanged.
 *
 * @param functionCall - the `source` call to process
 * @param data         - the current dataflow processor information
 * @param information  - the dataflow information produced for the call itself
 * @returns the (potentially extended) dataflow information
 */
export function processSourceCall<OtherInfo>(functionCall: RFunctionCall<OtherInfo & ParentInformation>, data: DataflowProcessorInformation<OtherInfo & ParentInformation>, information: DataflowInformation): DataflowInformation {
	const sourceFile = functionCall.arguments[0] as RArgument<ParentInformation> | undefined
	// we can only resolve the sourced file if its path is given as a constant string
	if(sourceFile?.value?.type !== RType.String) {
		dataflowLogger.info(`Non-constant argument ${JSON.stringify(sourceFile)} for source is currently not supported, skipping`)
		return information
	}

	const path = removeTokenMapQuotationMarks(sourceFile.lexeme)
	const request = sourceProvider.createRequest(path)

	// check if the sourced file has already been dataflow analyzed, and if so, skip it
	if(data.referenceChain.includes(requestFingerprint(request))) {
		dataflowLogger.info(`Found loop in dataflow analysis for ${JSON.stringify(request)}: ${JSON.stringify(data.referenceChain)}, skipping further dataflow analysis`)
		return information
	}

	return sourceRequest(request, data, information, sourcedDeterministicCountingIdGenerator(path, functionCall.location))
}

/**
 * Parses, normalizes and dataflow-analyzes the file referenced by the given request and
 * merges the resulting environments, graph and id map into the given information.
 *
 * If any of the sub-steps fails, the error is logged and the original information is
 * returned unchanged so that a broken sourced file does not abort the overall analysis.
 *
 * @param request     - the parse request pointing to the sourced file
 * @param data        - the current dataflow processor information
 * @param information - the dataflow information to extend
 * @param getId       - the id generator to use for the nodes of the sourced file
 */
export function sourceRequest<OtherInfo>(request: RParseRequest, data: DataflowProcessorInformation<OtherInfo & ParentInformation>, information: DataflowInformation, getId: IdGenerator<NoInfo>): DataflowInformation {
	const executor = new RShellExecutor()

	// parse, normalize and dataflow the sourced file
	let normalized: NormalizedAst<OtherInfo & ParentInformation>
	let dataflow: DataflowInformation
	try {
		const parsed = executeSingleSubStep('parse', request, executor) as string
		normalized = executeSingleSubStep('normalize', parsed, executor.getTokenMap(), undefined, getId) as NormalizedAst<OtherInfo & ParentInformation>
		dataflow = processDataflowFor(normalized.ast, {
			...data,
			currentRequest: request,
			environments:   information.environments,
			// extend the chain so that recursive/cyclic sources are detected and skipped
			referenceChain: [...data.referenceChain, requestFingerprint(request)]
		})
	} catch(e: unknown) {
		// `e` is not guaranteed to be an Error, so narrow before accessing `.message`
		const message = e instanceof Error ? e.message : String(e)
		dataflowLogger.warn(`Failed to analyze sourced file ${JSON.stringify(request)}, skipping: ${message}`)
		return information
	}

	// update our graph with the sourced file's information
	const newInformation = { ...information }
	newInformation.environments = overwriteEnvironments(information.environments, dataflow.environments)
	newInformation.graph.mergeWith(dataflow.graph)
	// copy the sourced file's nodes into the complete id map (can be improved, see issue #628)
	for(const [k, v] of normalized.idMap) {
		data.completeAst.idMap.set(k, v)
	}
	return newInformation
}
22 changes: 14 additions & 8 deletions src/dataflow/processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import type {
NormalizedAst,
ParentInformation, RNode,
RNodeWithParent
RNodeWithParent, RParseRequest
} from '../r-bridge'
import type { DataflowInformation } from './internal/info'
import type { DataflowScopeName, REnvironmentInformation } from './environments'
Expand All @@ -13,20 +13,29 @@ export interface DataflowProcessorInformation<OtherInfo> {
/**
* Initial and frozen ast-information
*/
readonly completeAst: NormalizedAst<OtherInfo>
readonly completeAst: NormalizedAst<OtherInfo>
/**
* Correctly contains pushed local scopes introduced by `function` scopes.
* Will by default *not* contain any symbol-bindings introduces along the way, they have to be decorated when moving up the tree.
*/
readonly environments: REnvironmentInformation
readonly environments: REnvironmentInformation
/**
* Name of the currently active scope, (hopefully) always {@link LocalScope | Local}
*/
readonly activeScope: DataflowScopeName
readonly activeScope: DataflowScopeName
/**
* Other processors to be called by the given functions
*/
readonly processors: DataflowProcessors<OtherInfo>
readonly processors: DataflowProcessors<OtherInfo>
/**
* The {@link RParseRequest} that is currently being parsed
*/
readonly currentRequest: RParseRequest
/**
* The chain of {@link RParseRequest} fingerprints ({@link requestFingerprint}) that lead to the {@link currentRequest}.
* The most recent (last) entry is expected to always be the {@link currentRequest}.
*/
readonly referenceChain: string[]
}

export type DataflowProcessor<OtherInfo, NodeType extends RNodeWithParent<OtherInfo>> = (node: NodeType, data: DataflowProcessorInformation<OtherInfo>) => DataflowInformation
Expand Down Expand Up @@ -55,6 +64,3 @@ export type DataflowProcessors<OtherInfo> = {
/**
 * Processes the given node by dispatching to the processor registered for its type in
 * `data.processors`, returning the resulting dataflow information.
 */
export function processDataflowFor<OtherInfo>(current: RNodeWithParent<OtherInfo>, data: DataflowProcessorInformation<OtherInfo & ParentInformation>): DataflowInformation {
	// `as never` satisfies each per-type processor signature; `current.type` guarantees the match
	return data.processors[current.type](current as never, data)
}



5 changes: 5 additions & 0 deletions src/r-bridge/lang-4.x/ast/model/processing/decorate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ export function deterministicCountingIdGenerator(start = 0): () => NodeId {
return () => `${id++}`
}

/**
 * Like {@link deterministicCountingIdGenerator}, but prefixes every generated id with the
 * given path and source location so that ids generated for a sourced file do not collide
 * with ids of the main file.
 *
 * @param path     - the path of the sourced file, used as the id prefix
 * @param location - the source location of the `source` call, embedded in each id
 * @param start    - the first counter value to use (defaults to 0)
 */
export function sourcedDeterministicCountingIdGenerator(path: string, location: SourceRange, start = 0): () => NodeId {
	let counter = start
	return () => `${path}-${loc2Id(location)}-${counter++}`
}

// renders a source range as a compact `line:col-line:col` string for use in ids
function loc2Id(loc: SourceRange) {
	const { start, end } = loc
	return `${start.line}:${start.column}-${end.line}:${end.column}`
}
Expand Down
10 changes: 7 additions & 3 deletions src/r-bridge/lang-4.x/ast/parser/xml/internal/xml-to-json.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@ import type { XmlBasedJson } from '../input-format'
* @param config - The configuration to use (i.e., what names should be used for the attributes, children, ...)
* @param xmlString - The xml input to parse
*/
export function xlm2jsonObject(config: XmlParserConfig, xmlString: string): Promise<XmlBasedJson> {
return xml2js.parseStringPromise(xmlString, {
export function xlm2jsonObject(config: XmlParserConfig, xmlString: string): XmlBasedJson {
let result: XmlBasedJson = {}
xml2js.parseString(xmlString, {
// we want this to be strictly synchronous so that the result can be returned immediately below!
async: false,
attrkey: config.attributeName,
charkey: config.contentName,
childkey: config.childrenName,
Expand All @@ -22,5 +25,6 @@ export function xlm2jsonObject(config: XmlParserConfig, xmlString: string): Prom
includeWhiteChars: true,
normalize: false,
strict: true
}) as Promise<XmlBasedJson>
}, (_, r)=> result = r as XmlBasedJson)
return result
}
4 changes: 2 additions & 2 deletions src/r-bridge/lang-4.x/ast/parser/xml/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ export const parseLog = log.getSubLogger({ name: 'ast-parser' })
*
* @returns The normalized and decorated AST (i.e., as a doubly linked tree)
*/
export async function normalize(xmlString: string, tokenMap: TokenMap, hooks?: DeepPartial<XmlParserHooks>, getId: IdGenerator<NoInfo> = deterministicCountingIdGenerator(0)): Promise<NormalizedAst> {
export function normalize(xmlString: string, tokenMap: TokenMap, hooks?: DeepPartial<XmlParserHooks>, getId: IdGenerator<NoInfo> = deterministicCountingIdGenerator(0)): NormalizedAst {
const config = { ...DEFAULT_XML_PARSER_CONFIG, tokenMap }
const hooksWithDefaults = deepMergeObject(DEFAULT_PARSER_HOOKS, hooks) as XmlParserHooks

const data: ParserData = { config, hooks: hooksWithDefaults, currentRange: undefined, currentLexeme: undefined }
const object = await xlm2jsonObject(config, xmlString)
const object = xlm2jsonObject(config, xmlString)

return decorateAst(parseRootObjToAst(data, object), getId)
}
Loading

2 comments on commit ec104b8

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"artificial" Benchmark Suite

Benchmark suite Current: ec104b8 Previous: d69018e Ratio
Total per-file 1512.2355823181817 ms (3694.360896380434) 1511.494083 ms (3708.3225553463017) 1.00
Retrieve AST from R code 63.72869486363637 ms (123.81549012137785) 64.46248863636363 ms (125.66414120100016) 0.99
Normalize R AST 94.4223031818182 ms (153.7545470057254) 94.99519236363636 ms (152.9376581920758) 0.99
Produce dataflow information 67.760007 ms (172.61951563539108) 65.2556795909091 ms (167.18441854609554) 1.04
Run abstract interpretation 0.036419181818181816 ms (0.018520681738474185) 0.03478995454545455 ms (0.01086253065474186) 1.05
Total per-slice 1.9578501304386124 ms (1.4212083640995328) 1.8724288794806876 ms (1.3873679811565907) 1.05
Static slicing 1.494833880817364 ms (1.3461684991098852) 1.4074784311593942 ms (1.3118563756339259) 1.06
Reconstruct code 0.44712880173053754 ms (0.21014848413193918) 0.4524929302663976 ms (0.22636683004337768) 0.99
failed to reconstruct/re-parse 0 # 0 # 1
times hit threshold 0 # 0 # 1
reduction (characters) 0.7329390759026896 # 0.7329390759026896 # 1
reduction (normalized tokens) 0.720988345209971 # 0.720988345209971 # 1

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"social-science" Benchmark Suite

Benchmark suite Current: ec104b8 Previous: d69018e Ratio
Total per-file 3780.83924088 ms (6262.620791394061) 3566.86774236 ms (5920.286185213901) 1.06
Retrieve AST from R code 72.47843676000001 ms (60.60667043532563) 72.22227936 ms (60.97026629229811) 1.00
Normalize R AST 112.02863244 ms (69.56951604022248) 113.02594858 ms (70.71306906384982) 0.99
Produce dataflow information 182.53878808000002 ms (284.4251735862355) 163.44175874 ms (276.9623037407309) 1.12
Run abstract interpretation 0.05759796 ms (0.02710947569390452) 0.06162254 ms (0.02971958909151336) 0.93
Total per-slice 9.230454502244271 ms (15.112408308435244) 8.599255365044066 ms (14.312877376595168) 1.07
Static slicing 8.704662440375671 ms (14.982894834915669) 8.071953766135923 ms (14.188089279803133) 1.08
Reconstruct code 0.5164380502435043 ms (0.26249066514714825) 0.5187709959800451 ms (0.27627204677573897) 1.00
failed to reconstruct/re-parse 9 # 9 # 1
times hit threshold 967 # 967 # 1
reduction (characters) 0.898713819973478 # 0.898713819973478 # 1
reduction (normalized tokens) 0.8579790415512589 # 0.8579790415512589 # 1

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.