Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW PURPOSE] Add PARSE_URL implementation in Java #20

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@

import io.netty.util.internal.PlatformDependent;

import java.net.URL;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class StringFunctionHelpers {
static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(StringFunctionHelpers.class);

Expand Down Expand Up @@ -221,6 +226,59 @@ public static long getDate(ArrowBuf buf, int start, int end){
return CHRONOLOGY.getDateTimeMillis(dateFields[0], dateFields[1], dateFields[2], 0);
}

/**
 * Extracts a single component from a URL string.
 *
 * @param urlStr text handed to {@code new URL(urlStr)}
 * @param partToExtract component selector; matched exactly (case-sensitive) against
 *        HOST, PATH, QUERY, REF, PROTOCOL, FILE, AUTHORITY and USERINFO
 * @param errCtx accepted for signature compatibility; not read by this method
 * @return the requested component, or an empty Optional when the URL cannot be constructed,
 *         the selector is not one of the names above, or the URL accessor returns null
 */
public static Optional<String> parseURL(String urlStr, String partToExtract, FunctionErrorContext errCtx){
  final URL parsed;
  try {
    parsed = new URL(urlStr);
  } catch (Exception e) {
    // Any failure while constructing the URL is reported as "no value", never as an error.
    return Optional.empty();
  }

  final String component;
  switch (partToExtract) {
    case "HOST":
      component = parsed.getHost();
      break;
    case "PATH":
      component = parsed.getPath();
      break;
    case "QUERY":
      component = parsed.getQuery();
      break;
    case "REF":
      component = parsed.getRef();
      break;
    case "PROTOCOL":
      component = parsed.getProtocol();
      break;
    case "FILE":
      component = parsed.getFile();
      break;
    case "AUTHORITY":
      component = parsed.getAuthority();
      break;
    case "USERINFO":
      component = parsed.getUserInfo();
      break;
    default:
      // Unknown selector: fall through to an empty result.
      component = null;
      break;
  }

  return Optional.ofNullable(component);
}

/**
 * Looks up the value of one key inside the query string of a URL.
 *
 * @param urlStr text to parse as a URL
 * @param partToExtract must equal {@code "QUERY"}; any other selector yields an empty Optional
 * @param keyPattern pattern applied to the query string; capture group 2 is returned as the value
 * @param errCtx forwarded to {@code parseURL}
 * @return capture group 2 of the first match, or an empty Optional when the selector is not
 *         QUERY, the query string is unavailable, the pattern does not match, or the group is null
 */
public static Optional<String> parseURLQueryKey(String urlStr, String partToExtract, Pattern keyPattern, FunctionErrorContext errCtx){
  final boolean queryRequested = partToExtract.equals("QUERY");
  if (queryRequested) {
    // Lambdas (not method references) so keyPattern is only dereferenced when a query string exists.
    return parseURL(urlStr, partToExtract, errCtx)
        .map(queryText -> keyPattern.matcher(queryText))
        .filter(matcher -> matcher.find())
        .map(matcher -> matcher.group(2));
  }
  return Optional.empty();
}

/**
* Takes a string value, specified as a buffer with a start and end and
* returns true if the value can be read as a date.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
import static com.dremio.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder;

import java.nio.charset.Charset;
import java.util.Optional;
import java.util.regex.Pattern;

import javax.inject.Inject;

Expand Down Expand Up @@ -1717,4 +1719,85 @@ public void eval() {
out.end = outBytea.length;
}
}

/**
 * Function registered as {@code parse_url} taking a URL and a constant part selector.
 * Delegates to {@code StringFunctionHelpers.parseURL} and emits the extracted part,
 * or a null output when nothing could be extracted.
 */
@FunctionTemplate(name = "parse_url", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL)
public static class ParseURL implements SimpleFunction{
  @Param VarCharHolder in;
  @Param(constant = true) VarCharHolder partToExtract;
  @Output NullableVarCharHolder out;
  @Inject ArrowBuf buffer;

  @Inject FunctionErrorContext errCtx;

  // Part selector decoded once in setup(); partToExtract is declared constant.
  @Workspace String urlPart;

  @Override
  public void setup() {
    urlPart = com.dremio.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(partToExtract.start,
        partToExtract.end, partToExtract.buffer);
  }

  @Override
  public void eval() {
    String url = com.dremio.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(in.start, in.end, in.buffer);
    String extracted = com.dremio.exec.expr.fn.impl.StringFunctionHelpers.parseURL(url, urlPart, errCtx).orElse(null);

    if (extracted == null) {
      // Nothing to return: mark the output as null explicitly instead of leaving isSet untouched.
      out.isSet = 0;
    } else {
      // The input is decoded with toStringFromUTF8, so encode with UTF-8 as well rather than
      // relying on the platform default charset.
      byte[] encoded = extracted.getBytes(java.nio.charset.StandardCharsets.UTF_8);

      // Make sure the injected buffer can hold the value before writing into it.
      // NOTE(review): assumes ArrowBuf.reallocIfNeeded returns the buffer to use from now on — confirm
      // against the Arrow version on the classpath.
      buffer = buffer.reallocIfNeeded(encoded.length);
      buffer.setBytes(0, encoded);

      out.isSet = 1;
      out.start = 0;
      out.end = encoded.length;
      out.buffer = buffer;
    }
  }
}

/**
 * Function registered as {@code parse_url} taking a URL, a constant part selector and a query key.
 * Delegates to {@code StringFunctionHelpers.parseURLQueryKey} and emits the value of the key,
 * or a null output when the key (or the query string) is not present.
 */
@FunctionTemplate(name = "parse_url", scope = FunctionScope.SIMPLE, nulls = NullHandling.INTERNAL)
public static class ParseURLQueryKey implements SimpleFunction{
  @Param VarCharHolder in;
  @Param(constant = true) VarCharHolder partToExtract;
  @Param VarCharHolder queryKey;
  @Output NullableVarCharHolder out;
  @Inject ArrowBuf buffer;
  @Inject FunctionErrorContext errCtx;

  // Part selector decoded once in setup(); partToExtract is declared constant.
  @Workspace String urlPart;
  // Key the cached pattern was compiled for; null until the first eval().
  @Workspace String lastKey;
  // Cached pattern for lastKey, recompiled only when the key changes.
  @Workspace Pattern pattern;

  @Override
  public void setup() {
    urlPart = com.dremio.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(partToExtract.start,
        partToExtract.end, partToExtract.buffer);
    // queryKey is not declared constant, so it is only read in eval(); the pattern is compiled
    // lazily there on first use.
    lastKey = null;
    pattern = null;
  }

  @Override
  public void eval() {
    String key = com.dremio.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(queryKey.start, queryKey.end,
        queryKey.buffer);
    if (pattern == null || !key.equals(lastKey)) {
      // Quote the key so regex metacharacters in it are matched literally and cannot
      // produce an invalid pattern.
      pattern = java.util.regex.Pattern.compile("(&|^)" + java.util.regex.Pattern.quote(key) + "=([^&]*)");
      lastKey = key;
    }

    String url = com.dremio.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(in.start, in.end, in.buffer);
    String extracted = com.dremio.exec.expr.fn.impl.StringFunctionHelpers.parseURLQueryKey(url, urlPart,
        pattern, errCtx).orElse(null);

    if (extracted == null) {
      // Null handling is INTERNAL, so the null result has to be flagged here.
      out.isSet = 0;
    } else {
      // The input is decoded with toStringFromUTF8, so encode with UTF-8 as well rather than
      // relying on the platform default charset.
      byte[] encoded = extracted.getBytes(java.nio.charset.StandardCharsets.UTF_8);

      // Make sure the injected buffer can hold the value before writing into it.
      // NOTE(review): assumes ArrowBuf.reallocIfNeeded returns the buffer to use from now on — confirm
      // against the Arrow version on the classpath.
      buffer = buffer.reallocIfNeeded(encoded.length);
      buffer.setBytes(0, encoded);

      out.isSet = 1;
      out.start = 0;
      out.end = encoded.length;
      out.buffer = buffer;
    }
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -343,4 +343,23 @@ public void stringfuncs(){
});
}

@Test
public void parseUrl(){
  // URL fixtures shared by several rows; each expression string below is assembled from them.
  final String plain = "http://facebook.com/path/p1.php?query=1";
  final String withPort = "http://127.0.0.0:8080/path/p1.php?query=1";
  final String twoKeys = "http://facebook.com/path/p1.php?query=1&k1=v1";

  // Each row is {expression, expected result}.
  testFunctions(new Object[][]{
    {"parse_url('" + plain + "', 'PROTOCOL')", "http"},
    {"parse_url('" + plain + "', 'HOST')", "facebook.com"},
    {"parse_url('" + withPort + "', 'HOST')", "127.0.0.0"},
    {"parse_url('" + plain + "', 'AUTHORITY')", "facebook.com"},
    {"parse_url('" + withPort + "', 'AUTHORITY')", "127.0.0.0:8080"},
    {"parse_url('" + plain + "#ref', 'REF')", "ref"},
    {"parse_url('" + plain + "', 'QUERY')", "query=1"},
    // Three-argument form: value lookup by key inside the query string.
    {"parse_url('" + plain + "', 'QUERY', 'query')", "1"},
    {"parse_url('" + twoKeys + "', 'QUERY', 'query')", "1"},
    {"parse_url('" + twoKeys + "', 'QUERY', 'k1')", "v1"},
    {"parse_url('" + plain + "', 'FILE')", "/path/p1.php?query=1"},
    {"parse_url('" + plain + "', 'PATH')", "/path/p1.php"},
    {"parse_url('http://userinfo@facebook.com/path/p1.php?query=1', 'USERINFO')", "userinfo"},
  });
}

}