Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Query API] Script dependencies #1066

Merged
merged 19 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/cli/repl/commands/repl-query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ import { splitAtEscapeSensitive } from '../../../util/args';
import { italic } from '../../../util/ansi';
import { describeSchema } from '../../../util/schema';
import type { Query, QueryResults, SupportedQueryTypes } from '../../../queries/query';
import { executeQueries } from '../../../queries/query';

import { executeQueries } from '../../../queries/query';
import type { PipelineOutput } from '../../../core/steps/pipeline/pipeline';
import { jsonReplacer } from '../../../util/json';
import { AnyQuerySchema, QueriesSchema } from '../../../queries/query-schema';


async function getDataflow(shell: RShell, remainingLine: string) {
return await new PipelineExecutor(DEFAULT_DATAFLOW_PIPELINE, {
shell,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import { RType } from '../../../../../../r-bridge/lang-4.x/ast/model/type';
import { overwriteEnvironment } from '../../../../../environments/overwrite';
import type { NoInfo } from '../../../../../../r-bridge/lang-4.x/ast/model/model';
import { expensiveTrace } from '../../../../../../util/log';
import fs from 'fs';

let sourceProvider = requestProviderFromFile();

Expand Down Expand Up @@ -76,6 +77,14 @@ export function processSourceCall<OtherInfo>(
}

export function sourceRequest<OtherInfo>(rootId: NodeId, request: RParseRequest, data: DataflowProcessorInformation<OtherInfo & ParentInformation>, information: DataflowInformation, getId: IdGenerator<NoInfo>): DataflowInformation {
if(request.request === 'file') {
/* check if the file exists and if not, fail */
if(!fs.existsSync(request.content)) {
dataflowLogger.warn(`Failed to analyze sourced file ${JSON.stringify(request)}: file does not exist`);
information.graph.markIdForUnknownSideEffects(rootId);
return information;
}
}
const executor = new RShellExecutor();

// parse, normalize and dataflow the sourced file
Expand Down
1 change: 0 additions & 1 deletion src/documentation/doc-util/doc-query.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import type { RShell } from '../../r-bridge/shell';
import type { Queries, QueryResults, SupportedQueryTypes } from '../../queries/query';
import { SupportedQueries , executeQueries } from '../../queries/query';

import { PipelineExecutor } from '../../core/pipeline-executor';
import { DEFAULT_DATAFLOW_PIPELINE } from '../../core/steps/pipeline/default-pipelines';
import { requestFromInput } from '../../r-bridge/retriever';
Expand Down
128 changes: 128 additions & 0 deletions src/queries/catalog/dependencies-query/dependencies-query-executor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import type { BasicQueryData } from '../../query';
import { executeQueries } from '../../query';
import type {
DependenciesQuery,
DependenciesQueryResult, DependencyInfo,
FunctionInfo,
LibraryInfo,
ReadInfo, SourceInfo,
WriteInfo
} from './dependencies-query-format';
import { LibraryFunctions, ReadFunctions, SourceFunctions, WriteFunctions } from './dependencies-query-format';
import type { CallContextQuery, CallContextQueryResult } from '../call-context-query/call-context-query-format';
import type { DataflowGraphVertexFunctionCall } from '../../../dataflow/graph/vertex';
import { getReferenceOfArgument } from '../../../dataflow/graph/graph';
import { log } from '../../../util/log';
import { RType } from '../../../r-bridge/lang-4.x/ast/model/type';
import { removeRQuotes } from '../../../r-bridge/retriever';
import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call';
import type { NodeId } from '../../../r-bridge/lang-4.x/ast/model/processing/node-id';

/** AST node types whose lexeme is accepted as a resolved argument value by default (callers may allow additional types). */
const SupportedVertexTypes = [RType.String, RType.Logical, RType.Number];

/** Placeholder reported when an argument exists but its value cannot be resolved to one of the allowed node types. */
const Unknown = 'unknown';

/**
 * Executes the dependencies query: collects all calls to library-loading, sourcing,
 * data-reading and data-writing functions and reports the (statically resolvable)
 * argument that names the respective library, file, source or destination.
 *
 * Only the first query in `queries` is evaluated; a warning is logged whenever the
 * number of queries differs from one. If no query is given at all, an empty result
 * is returned instead of failing on the missing query object.
 *
 * @param data    - The query data, its dataflow graph is used to resolve call vertices and arguments.
 * @param queries - The dependencies queries to execute (expected to contain exactly one element).
 * @returns The found libraries, sourced files, read data and written data, together with the elapsed time in `.meta.timing`.
 */
export function executeDependenciesQuery(data: BasicQueryData, queries: readonly DependenciesQuery[]): DependenciesQueryResult {
	if(queries.length !== 1) {
		log.warn('Dependencies query expects only up to one query, but got ', queries.length);
	}
	const now = Date.now();

	// without any query there is nothing to configure the search with, so report no dependencies
	if(queries.length === 0) {
		return {
			'.meta': {
				timing: Date.now() - now
			},
			libraries: [], sourcedFiles: [], readData: [], writtenData: []
		};
	}

	const query = queries[0];
	const ignoreDefault = query.ignoreDefaultFunctions ?? false;
	const libraryFunctions = getFunctionsToCheck(query.libraryFunctions, ignoreDefault, LibraryFunctions);
	const sourceFunctions = getFunctionsToCheck(query.sourceFunctions, ignoreDefault, SourceFunctions);
	const readFunctions = getFunctionsToCheck(query.readFunctions, ignoreDefault, ReadFunctions);
	const writeFunctions = getFunctionsToCheck(query.writeFunctions, ignoreDefault, WriteFunctions);

	// all categories are resolved with a single batch of call-context queries,
	// the category is encoded as the `kind` and the function name as the `subkind`
	const results = executeQueries(data, [
		...makeCallContextQuery(libraryFunctions, 'library'),
		...makeCallContextQuery(sourceFunctions, 'source'),
		...makeCallContextQuery(readFunctions, 'read'),
		...makeCallContextQuery(writeFunctions, 'write')
	])['call-context'];

	// library names may be given as bare symbols, hence symbols are additionally allowed here
	const libraries: LibraryInfo[] = getResults(data, results, 'library', libraryFunctions, (id, vertex, argument) => ({
		nodeId:       id,
		functionName: vertex.name,
		libraryName:  argument ?? Unknown
	}), [RType.Symbol]);
	const sourcedFiles: SourceInfo[] = getResults(data, results, 'source', sourceFunctions, (id, vertex, argument) => ({
		nodeId:       id,
		functionName: vertex.name,
		file:         argument ?? Unknown
	}));
	const readData: ReadInfo[] = getResults(data, results, 'read', readFunctions, (id, vertex, argument) => ({
		nodeId:       id,
		functionName: vertex.name,
		source:       argument ?? Unknown
	}));
	const writtenData: WriteInfo[] = getResults(data, results, 'write', writeFunctions, (id, vertex, argument) => ({
		nodeId:       id,
		functionName: vertex.name,
		// write functions that don't have argIndex are assumed to write to stdout
		destination:  argument ?? 'stdout'
	}));

	return {
		'.meta': {
			timing: Date.now() - now
		},
		libraries, sourcedFiles, readData, writtenData
	};
}

/**
 * Builds one exact-name, alias-aware call-context query per function.
 * The given `kind` groups the queries by category, while the function name
 * doubles as the `subkind` so that results can be mapped back to their function.
 *
 * @param functions - The functions to create queries for.
 * @param kind      - The category label shared by all created queries.
 * @returns The call-context queries, in the order of `functions`.
 */
function makeCallContextQuery(functions: readonly FunctionInfo[], kind: string): CallContextQuery[] {
	const queries: CallContextQuery[] = [];
	for(const { name } of functions) {
		queries.push({
			type:           'call-context',
			callName:       name,
			includeAliases: true,
			callNameExact:  true,
			subkind:        name,
			kind
		});
	}
	return queries;
}

/**
 * Converts the call-context results of one category into dependency infos.
 *
 * For every subkind (i.e., function name) of the given `kind`, the matching {@link FunctionInfo}
 * determines which argument of the call holds the value of interest: a named argument matching
 * `argName` takes precedence, otherwise the positional `argIdx` is used. If neither is
 * configured, `makeInfo` receives `undefined` as the argument value.
 *
 * @param data                   - The query data, used to resolve the call vertices and their arguments.
 * @param results                - The call-context results to convert.
 * @param kind                   - The category whose subkinds should be converted.
 * @param functions              - The function infos the call-context queries were created from.
 * @param makeInfo               - Creates the dependency info for a single call; `undefined` results are dropped.
 * @param additionalAllowedTypes - Node types accepted as argument values in addition to the default ones.
 * @returns The infos of all calls found for the given kind.
 */
function getResults<T extends DependencyInfo>(data: BasicQueryData, results: CallContextQueryResult, kind: string, functions: FunctionInfo[], makeInfo: (id: NodeId, vertex: DataflowGraphVertexFunctionCall, argument: string | undefined) => T | undefined, additionalAllowedTypes?: RType[]) {
	return Object.entries(results?.kinds[kind]?.subkinds ?? {}).flatMap(([name, calls]) => {
		// the function info only depends on the subkind, so it is looked up once and not per call
		const info = functions.find(f => f.name === name) as FunctionInfo;
		return calls.map(({ id }) => {
			const vertex = data.graph.getVertex(id) as DataflowGraphVertexFunctionCall;
			let index = info.argIdx;
			if(info.argName) {
				// an explicitly named argument overrules the positional default
				const namedIndex = vertex?.args.findIndex(arg => arg !== EmptyArgument && arg.name === info.argName);
				if(namedIndex >= 0) {
					index = namedIndex;
				}
			}
			const argument = index !== undefined ? getArgumentValue(data, vertex, index, additionalAllowedTypes) : undefined;
			return makeInfo(id, vertex, argument);
		});
	}).filter(x => x !== undefined);
}

/**
 * Resolves the value of the argument at the given position of a function call.
 *
 * @param data                   - The query data; the id map of its graph is used to look up the argument node.
 * @param vertex                 - The function call vertex to inspect.
 * @param argumentIndex          - The position of the argument within `vertex.args`.
 * @param additionalAllowedTypes - Node types accepted in addition to {@link SupportedVertexTypes}.
 * @returns The lexeme of the argument value (passed through `removeRQuotes`) if its node has an allowed type,
 *          {@link Unknown} if the value node exists but has another type, and `undefined`
 *          if the call has no such argument or the argument cannot be resolved to a node.
 */
function getArgumentValue({ graph }: BasicQueryData, vertex: DataflowGraphVertexFunctionCall, argumentIndex: number, additionalAllowedTypes: RType[] | undefined): string | undefined {
	if(!vertex || vertex.args.length <= argumentIndex) {
		return undefined;
	}
	const reference = getReferenceOfArgument(vertex.args[argumentIndex]);
	// compare against undefined explicitly: a truthiness check would discard a numeric node id of 0
	if(reference === undefined) {
		return undefined;
	}
	let valueNode = graph.idMap?.get(reference);
	if(valueNode?.type === RType.Argument) {
		// unwrap the argument to get to the node holding the actual value
		valueNode = valueNode.value;
	}
	if(!valueNode) {
		return undefined;
	}
	const allowedTypes = [...SupportedVertexTypes, ...additionalAllowedTypes ?? []];
	return allowedTypes.includes(valueNode.type) ? removeRQuotes(valueNode.lexeme as string) : Unknown;
}

/**
 * Assembles the list of functions to search for: the defaults (unless they are to be ignored)
 * followed by the custom functions (if any). The result is always a fresh array.
 *
 * @param customFunctions        - User-supplied functions to append, may be `undefined`.
 * @param ignoreDefaultFunctions - Whether the default functions should be left out.
 * @param defaultFunctions       - The functions that are checked by default.
 */
function getFunctionsToCheck(customFunctions: FunctionInfo[] | undefined, ignoreDefaultFunctions: boolean, defaultFunctions: FunctionInfo[]): FunctionInfo[] {
	const defaults = ignoreDefaultFunctions ? [] : defaultFunctions;
	const customs = customFunctions ? customFunctions : [];
	return [...defaults, ...customs];
}
129 changes: 129 additions & 0 deletions src/queries/catalog/dependencies-query/dependencies-query-format.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import type { BaseQueryFormat, BaseQueryResult } from '../../base-query-format';
import type { NodeId } from '../../../r-bridge/lang-4.x/ast/model/processing/node-id';
import type { QueryResults, SupportedQuery } from '../../query';
import { bold } from '../../../util/ansi';
import { printAsMs } from '../../../util/time';
import Joi from 'joi';
import { executeDependenciesQuery } from './dependencies-query-executor';

// these lists are originally based on https://github.com/duncantl/CodeDepends/blob/7fd96dfee16b252e5f642c77a7ababf48e9326f8/R/codeTypes.R
/**
 * Functions that are treated as loading a library by default.
 * For each function, `argIdx`/`argName` identify the argument that holds the library name.
 */
export const LibraryFunctions: FunctionInfo[] = [
{ name: 'library', argIdx: 0, argName: 'package' },
{ name: 'require', argIdx: 0, argName: 'package' },
{ name: 'loadNamespace', argIdx: 0, argName: 'package' },
{ name: 'attachNamespace', argIdx: 0, argName: 'ns' },
] as const;
/**
 * Functions that are treated as sourcing another file by default.
 * `argIdx`/`argName` identify the argument that holds the sourced file.
 */
export const SourceFunctions: FunctionInfo[] = [
{ name: 'source', argIdx: 0, argName: 'file' }
] as const;
/**
 * Functions that are treated as reading data by default.
 * For each function, `argIdx`/`argName` identify the argument that is reported as the data source.
 */
export const ReadFunctions: FunctionInfo[] = [
	{ name: 'read.table',    argIdx: 0, argName: 'file' },
	{ name: 'read.csv',      argIdx: 0, argName: 'file' },
	{ name: 'read.csv2',     argIdx: 0, argName: 'file' },
	{ name: 'read.delim',    argIdx: 0, argName: 'file' },
	// was a second 'read.delim' entry; R's read.table family offers read.delim2 next to read.delim
	{ name: 'read.delim2',   argIdx: 0, argName: 'file' },
	{ name: 'read.fwf',      argIdx: 0, argName: 'file' },
	// NOTE(review): for R's connection constructors (file, url, gzfile, ...) `open` is presumably the mode
	// string while the path/URL is the first argument (`description`) - confirm that `open` is intended here
	{ name: 'file',          argIdx: 1, argName: 'open' },
	{ name: 'url',           argIdx: 1, argName: 'open' },
	{ name: 'load',          argIdx: 0, argName: 'file' },
	{ name: 'gzfile',        argIdx: 1, argName: 'open' },
	{ name: 'bzfile',        argIdx: 1, argName: 'open' },
	{ name: 'download.file', argIdx: 0, argName: 'url' },
	{ name: 'pipe',          argIdx: 1, argName: 'open' },
	{ name: 'fifo',          argIdx: 1, argName: 'open' },
	{ name: 'unz',           argIdx: 1, argName: 'open' },
	{ name: 'matrix',        argIdx: 0, argName: 'data' },
	{ name: 'readRDS',       argIdx: 0, argName: 'file' },
	{ name: 'readLines',     argIdx: 0, argName: 'con' },
] as const;
/**
 * Functions that are treated as writing data by default.
 * `argIdx`/`argName` identify the argument that is reported as the destination;
 * entries without either are reported with the destination `stdout`.
 */
export const WriteFunctions: FunctionInfo[] = [
{ name: 'save', argIdx: 0, argName: '...' },
{ name: 'save.image', argIdx: 0, argName: 'file' },
{ name: 'write', argIdx: 1, argName: 'file' },
{ name: 'dput', argIdx: 1, argName: 'file' },
{ name: 'dump', argIdx: 1, argName: 'file' },
{ name: 'write.table', argIdx: 1, argName: 'file' },
{ name: 'write.csv', argIdx: 1, argName: 'file' },
{ name: 'saveRDS', argIdx: 1, argName: 'file' },
// write functions that don't have argIndex are assumed to write to stdout
{ name: 'print' },
{ name: 'cat' },
] as const;

/**
 * Describes a function the dependencies query searches for,
 * and which of its arguments carries the value of interest.
 */
export interface FunctionInfo {
/** The name of the function, matched exactly against call names. */
name: string
/** The positional index of the argument of interest, used if no argument named `argName` is found in the call. */
argIdx?: number
/** The name of the argument of interest; if the call passes an argument with this name, it takes precedence over `argIdx`. */
argName?: string
}

/**
 * Query for the dependencies of a script: loaded libraries, sourced files, read data and written data.
 * Each of the `*Functions` lists adds custom functions to the respective category.
 */
export interface DependenciesQuery extends BaseQueryFormat {
readonly type: 'dependencies'
/** If `true`, the default function lists are skipped and only the custom functions given below are searched for (defaults to `false`). */
readonly ignoreDefaultFunctions?: boolean
/** Additional functions to treat as loading a library. */
readonly libraryFunctions?: FunctionInfo[]
/** Additional functions to treat as sourcing a file. */
readonly sourceFunctions?: FunctionInfo[]
/** Additional functions to treat as reading data. */
readonly readFunctions?: FunctionInfo[]
/** Additional functions to treat as writing data. */
readonly writeFunctions?: FunctionInfo[]
}

/** The result of a {@link DependenciesQuery}, with one list of found calls per dependency category. */
export interface DependenciesQueryResult extends BaseQueryResult {
/** Calls to library-loading functions. */
libraries: LibraryInfo[]
/** Calls to file-sourcing functions. */
sourcedFiles: SourceInfo[]
/** Calls to data-reading functions. */
readData: ReadInfo[]
/** Calls to data-writing functions. */
writtenData: WriteInfo[]
}

/** Information shared by all found dependencies. */
export interface DependencyInfo {
/** The id of the function call that introduces the dependency. */
nodeId: NodeId
/** The name of the called function. */
functionName: string
}
// The literal members in the unions below ('unknown', 'stdout') only document the special values
// produced by the executor; for the type checker, these unions are equivalent to plain `string`.
/** A loaded library; `libraryName` is 'unknown' if the name could not be resolved. */
export type LibraryInfo = (DependencyInfo & { libraryName: 'unknown' | string })
/** A sourced file. */
export type SourceInfo = (DependencyInfo & { file: string })
/** A data read. */
export type ReadInfo = (DependencyInfo & { source: string })
/** A data write; `destination` is 'stdout' if no destination argument was resolved for the call. */
export type WriteInfo = (DependencyInfo & { destination: 'stdout' | string })

/**
 * Appends a textual section for one dependency category to `result`.
 * The section consists of the title, followed - for every function name, in order of first
 * occurrence - by the function name and a single newline-joined entry listing all of its calls.
 * Nothing is appended if `infos` is empty.
 *
 * @param title            - The heading of the section.
 * @param infos            - The dependency infos to print.
 * @param result           - The output lines to append to (modified in place).
 * @param sectionSpecifics - Renders the category-specific part of a single info.
 */
function printResultSection<T extends DependencyInfo>(title: string, infos: T[], result: string[], sectionSpecifics: (info: T) => string): void {
	if(infos.length <= 0) {
		return;
	}
	result.push(` ╰ ${title}`);

	// bucket the infos by the name of the called function, a Map retains the insertion order
	const byFunction = new Map<string, T[]>();
	for(const info of infos) {
		const bucket = byFunction.get(info.functionName);
		if(bucket === undefined) {
			byFunction.set(info.functionName, [info]);
		} else {
			bucket.push(info);
		}
	}

	byFunction.forEach((entries, functionName) => {
		result.push(` ╰ ${functionName}`);
		const lines = entries.map(entry => ` ╰ Node Id: ${entry.nodeId}, ${sectionSpecifics(entry)}`);
		result.push(lines.join('\n'));
	});
}

/**
 * Schema for an optional list of function infos (required `name`, optional `argIdx` and `argName`).
 * NOTE(review): the descriptions speak of the "library" function/name, although this schema is shared by
 * the library, source, read and write function lists - consider a category-neutral wording.
 */
const functionInfoSchema: Joi.ArraySchema = Joi.array().items(Joi.object({
name: Joi.string().required().description('The name of the library function.'),
argIdx: Joi.number().optional().description('The index of the argument that contains the library name.'),
argName: Joi.string().optional().description('The name of the argument that contains the library name.'),
})).optional();

/**
 * Definition of the dependencies query: its executor, a summarizer that renders the result
 * as text (a header with the timing, followed by one section per dependency category),
 * and the schema used to validate incoming queries.
 */
export const DependenciesQueryDefinition = {
	executor:        executeDependenciesQuery,
	asciiSummarizer: (formatter, _processed, queryResults, result) => {
		const { '.meta': meta, libraries, sourcedFiles, readData, writtenData } = queryResults as QueryResults<'dependencies'>['dependencies'];
		const header = `Query: ${bold('dependencies', formatter)} (${printAsMs(meta.timing, 0)})`;
		result.push(header);
		// one section per category, empty categories do not produce any output
		printResultSection('Libraries', libraries, result, info => `Library Name: ${info.libraryName}`);
		printResultSection('Sourced Files', sourcedFiles, result, info => `Sourced File: ${info.file}`);
		printResultSection('Read Data', readData, result, info => `Source: ${info.source}`);
		printResultSection('Written Data', writtenData, result, info => `Destination: ${info.destination}`);
		return true;
	},
	schema: Joi.object({
		type:                   Joi.string().valid('dependencies').required().description('The type of the query.'),
		ignoreDefaultFunctions: Joi.boolean().optional().description('Should the set of functions that are detected by default be ignored/skipped?'),
		libraryFunctions:       functionInfoSchema.description('The set of library functions to search for.'),
		sourceFunctions:        functionInfoSchema.description('The set of source functions to search for.'),
		readFunctions:          functionInfoSchema.description('The set of data reading functions to search for.'),
		writeFunctions:         functionInfoSchema.description('The set of data writing functions to search for.'),
	}).description('The dependencies query retrieves and returns the set of all dependencies in the dataflow graph, which includes libraries, sourced files, read data, and written data.')
} as const satisfies SupportedQuery<'dependencies'>;
4 changes: 2 additions & 2 deletions src/queries/catalog/lineage-query/lineage-query-format.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ export const LineageQueryDefinition = {
return true;
},
schema: Joi.object({
type: Joi.string().valid('lineage').required().description('The type of the query.'),
id: Joi.string().required().description('The ID of the node to get the lineage of.')
type: Joi.string().valid('lineage').required().description('The type of the query.'),
criterion: Joi.string().required().description('The slicing criterion of the node to get the lineage of.')
}).description('Lineage query used to find the lineage of a node in the dataflow graph')
} as const satisfies SupportedQuery<'lineage'>;
22 changes: 13 additions & 9 deletions src/queries/query.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import type { CallContextQuery } from './catalog/call-context-query/call-context-query-format';
import type {
CallContextQuery
} from './catalog/call-context-query/call-context-query-format';
import { CallContextQueryDefinition } from './catalog/call-context-query/call-context-query-format';

import type { DataflowGraph } from '../dataflow/graph/graph';
import type { BaseQueryFormat, BaseQueryResult } from './base-query-format';
import { guard } from '../util/assert';
Expand All @@ -15,18 +16,20 @@ import type { IdMapQuery } from './catalog/id-map-query/id-map-query-format';
import { IdMapQueryDefinition } from './catalog/id-map-query/id-map-query-format';
import type { NormalizedAstQuery } from './catalog/normalized-ast-query/normalized-ast-query-format';
import { NormalizedAstQueryDefinition } from './catalog/normalized-ast-query/normalized-ast-query-format';
import type { DataflowClusterQuery } from './catalog/cluster-query/cluster-query-format';
import { ClusterQueryDefinition } from './catalog/cluster-query/cluster-query-format';
import type { StaticSliceQuery } from './catalog/static-slice-query/static-slice-query-format';
import { StaticSliceQueryDefinition } from './catalog/static-slice-query/static-slice-query-format';
import type { LineageQuery } from './catalog/lineage-query/lineage-query-format';
import { LineageQueryDefinition } from './catalog/lineage-query/lineage-query-format';
import { type OutputFormatter } from '../util/ansi';
import type { StaticSliceQuery } from './catalog/static-slice-query/static-slice-query-format';
import { StaticSliceQueryDefinition } from './catalog/static-slice-query/static-slice-query-format';
import type { DataflowClusterQuery } from './catalog/cluster-query/cluster-query-format';
import { ClusterQueryDefinition } from './catalog/cluster-query/cluster-query-format';
import type { DependenciesQuery } from './catalog/dependencies-query/dependencies-query-format';
import { DependenciesQueryDefinition } from './catalog/dependencies-query/dependencies-query-format';
import type { OutputFormatter } from '../util/ansi';
import type { PipelineOutput } from '../core/steps/pipeline/pipeline';
import type { DEFAULT_DATAFLOW_PIPELINE } from '../core/steps/pipeline/default-pipelines';
import type Joi from 'joi';

export type Query = CallContextQuery | DataflowQuery | NormalizedAstQuery | IdMapQuery | DataflowClusterQuery | StaticSliceQuery | LineageQuery;
export type Query = CallContextQuery | DataflowQuery | NormalizedAstQuery | IdMapQuery | DataflowClusterQuery | StaticSliceQuery | LineageQuery | DependenciesQuery;

export type QueryArgumentsWithType<QueryType extends BaseQueryFormat['type']> = Query & { type: QueryType };

Expand Down Expand Up @@ -55,7 +58,8 @@ export const SupportedQueries = {
'normalized-ast': NormalizedAstQueryDefinition,
'dataflow-cluster': ClusterQueryDefinition,
'static-slice': StaticSliceQueryDefinition,
'lineage': LineageQueryDefinition
'lineage': LineageQueryDefinition,
'dependencies': DependenciesQueryDefinition
} as const satisfies SupportedQueries;

export type SupportedQueryTypes = keyof typeof SupportedQueries;
Expand Down
Loading