Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Normalize SQL IN(?, ?, ...) statements to "in(?)" to reduce cardinality of db.statement attribute #10564

Merged
merged 6 commits into from
Mar 12, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

package io.opentelemetry.instrumentation.api.incubator.semconv.db;

import java.util.regex.Pattern;

%%

%final
Expand Down Expand Up @@ -52,6 +54,10 @@ WHITESPACE = [ \t\r\n]+
// max length of the sanitized statement - SQLs longer than this will be trimmed
static final int LIMIT = 32 * 1024;

// Match on "IN(?, ?, ...)"
private static final Pattern IN_STATEMENT_PATTERN = Pattern.compile("(\\sin\\s*)\\(\\s*\\?\\s*(,\\s*\\?\\s*)*+\\)", Pattern.CASE_INSENSITIVE);
private static final String IN_STATEMENT_NORMALIZED = " in(?)";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
private static final String IN_STATEMENT_NORMALIZED = " in(?)";
private static final String IN_STATEMENT_NORMALIZED = "$1(?)";

Sanitizer does not change case or remove whitespace from the original query. Lets keep in as it was in the original query. longInStatementDoesntCauseStackOverflow will break after this change as there is a space between in and ( that is currently removed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, I updated it to preserve case and whitespace, and updated the test cases to check for that. I didn't add a test case for more than one space between IN and ( since strings like IN (?) get sanitized to IN (?)

Also switched to a non-capturing group for matching on the part in-between the brackets as a small optimization


private final StringBuilder builder = new StringBuilder();

private void appendCurrentFragment() {
Expand Down Expand Up @@ -278,7 +284,11 @@ WHITESPACE = [ \t\r\n]+
builder.delete(LIMIT, builder.length());
}
String fullStatement = builder.toString();
return operation.getResult(fullStatement);

// Normalize all 'in (?, ?, ...)' statements to in (?) to reduce cardinality
String normalizedStatement = IN_STATEMENT_PATTERN.matcher(fullStatement).replaceAll(IN_STATEMENT_NORMALIZED);

return operation.getResult(normalizedStatement);
}

%}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,19 @@ void randomBytesDontCauseExceptionsOrTimeouts() {
}
}

@Test
public void longInStatementDoesntCauseStackOverflow() {
StringBuilder s = new StringBuilder("select col from table where col in (");
for (int i = 0; i < 10000; i++) {
s.append("?,");
}
s.append("?)");

String sanitized = SqlStatementSanitizer.create(true).sanitize(s.toString()).getFullStatement();

assertThat(sanitized).isEqualTo("select col from table where col in(?)");
}

static class SqlArgs implements ArgumentsProvider {

@Override
Expand Down Expand Up @@ -271,7 +284,11 @@ public Stream<? extends Arguments> provideArguments(ExtensionContext context) th
Arguments.of("select col from table1 as t1, table2 as t2", expect("SELECT", null)),
Arguments.of(
"select col from table where col in (1, 2, 3)",
expect("select col from table where col in (?, ?, ?)", "SELECT", "table")),
expect("select col from table where col in(?)", "SELECT", "table")),
Arguments.of(
"select 'a' IN(x, 'b') from table where col in(1) and z IN( '3', '4' )",
expect(
"select ? IN(x, ?) from table where col in(?) and z in(?)", "SELECT", "table")),
Arguments.of("select col from table order by col, col2", expect("SELECT", "table")),
Arguments.of("select ąś∂ń© from źćļńĶ order by col, col2", expect("SELECT", "źćļńĶ")),
Arguments.of("select 12345678", expect("select ?", "SELECT", null)),
Expand All @@ -298,6 +315,9 @@ public Stream<? extends Arguments> provideArguments(ExtensionContext context) th
"delete from `my table` where something something", expect("DELETE", "my table")),
Arguments.of(
"delete from \"my table\" where something something", expect("DELETE", "my table")),
Arguments.of(
"delete from foo where x IN(1, 2, 3)",
expect("delete from foo where x in(?)", "DELETE", "foo")),
Arguments.of("delete from 12345678", expect("delete from ?", "DELETE", null)),
Arguments.of("delete (((", expect("delete (((", "DELETE", null)),

Expand All @@ -307,6 +327,9 @@ public Stream<? extends Arguments> provideArguments(ExtensionContext context) th
Arguments.of(
"update `my table` set answer=42",
expect("update `my table` set answer=?", "UPDATE", "my table")),
Arguments.of(
"update `my table` set answer=42 where x IN('a', 'b')",
expect("update `my table` set answer=? where x in(?)", "UPDATE", "my table")),
Arguments.of(
"update \"my table\" set answer=42",
expect("update \"my table\" set answer=?", "UPDATE", "my table")),
Expand Down
Loading