Add utility for transforming Trino ROW type columns into Ruby hashes #117

Merged: 5 commits, Sep 19, 2023
115 changes: 115 additions & 0 deletions lib/trino/client/column_value_parser.rb
@@ -0,0 +1,115 @@
module Trino::Client
class ColumnValueParser
INSIDE_MATCHING_PARENS_REGEX = /\((?>[^)(]+|\g<0>)*\)/

attr_reader :name, :type, :scalar_parser

def initialize(column, scalar_parser = nil)
@name = column.name
@type = prepare_type_for_parsing(column.type)
@scalar_parser = scalar_parser
end

# Public: Parse the value of a row's field by using its column's Trino type.
# Trino types can be scalars like VARCHAR and TIMESTAMP or complex types
# like ARRAY and ROW. ROW types are treated as objects.
# An ARRAY column's type is an array of types as you'd expect. A ROW
# column's type is a comma-separated list of space-separated (name, type) tuples.
#
# data - The value of a row's field. Can be a string, number, an array of those,
# or an array of arrays, etc.
# dtype - The Trino type string of the column. See above explanation.
#
# Returns:
# - The given value for strings and numbers
# - A Time for timestamps
# - A Hash of { field1 => value1, field2 => value2, ...etc } for row types
# - An array of the above for array types
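# Example (illustrative values only, not taken from this diff):
#
#   value(['spaniel', 2], 'row(breed varchar, num_spots integer)')
#   # => { 'breed' => 'spaniel', 'num_spots' => 2 }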
def value(data, dtype = type)
# Convert Trino ARRAY elements into Ruby Arrays
if starts_with?(dtype, 'array(')
return parse_array_element(data, dtype)

# Convert Trino ROW elements into Ruby Hashes
elsif starts_with?(dtype, 'row(')
return parse_row_element(data, dtype)

# If defined, use scalar_parser to convert scalar types
elsif !scalar_parser.nil?
return scalar_parser.call(data, dtype)
end

# Otherwise, values are returned unaltered
data
end

private

# Private: Remove quotation marks and handle recent versions of
# Trino having a 'with time zone' suffix on some fields that breaks
# our assumption that types don't have spaces in them.
#
# Returns a string.
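# Example (hypothetical type string, for illustration):
#
#   prepare_type_for_parsing('row("name" varchar, created_at timestamp with time zone)')
#   # => 'row(name varchar, created_at timestamp_with_time_zone)'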
def prepare_type_for_parsing(type)
type.gsub('"', '').gsub(' with time zone', '_with_time_zone')
end

def parse_array_element(data, dtype)
# If the element is empty, return an empty array
return [] if blank?(data)

# Inner data type will be the current dtype with `array(` and `)` chopped off
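# e.g. 'array(row(id varchar))' yields an inner_dtype of 'row(id varchar)'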
inner_dtype = dtype.match(INSIDE_MATCHING_PARENS_REGEX)[0][1..-2]

data.map { |inner_data| value(inner_data, inner_dtype) }
end

def parse_row_element(data, dtype)
# If the element is empty, return an empty object
return {} if blank?(data)

parsed_row_element = {}

inner_dtype = dtype.match(INSIDE_MATCHING_PARENS_REGEX)[0][1..-2]
elems = inner_dtype.split(' ')
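# e.g. for inner_dtype 'id varchar, logins array(varchar)',
# elems is ['id', 'varchar,', 'logins', 'array(varchar)']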
num_elems_to_skip = 0
field_position = 0

# Iterate over each datatype of the row and mutate parsed_row_element
# to have a key of the field name and value for that field's value.
elems.each_with_index do |field, i|
# We detected an array or row and are skipping all of the elements within it
# since its conversion was handled by calling `value` recursively.
if num_elems_to_skip.positive?
num_elems_to_skip -= 1
next
end

# Field names never have these characters and are never the last element.
next if field.include?(',') || field.include?('(') || field.include?(')') || i == elems.length - 1

type = elems[(i + 1)..].join(' ')

# If this row has a nested array or row, the type of this field is that array or row's type.
if starts_with?(type, 'array(') || starts_with?(type, 'row(')
datatype = type.sub(/\(.*/, '')
type = "#{datatype}#{type.match(INSIDE_MATCHING_PARENS_REGEX)[0]}"
num_elems_to_skip = type.split(' ').length # see above comment about num_elems_to_skip
end

parsed_row_element[field] = value(data[field_position], type)
field_position += 1
end

parsed_row_element
end

def blank?(obj)
obj.respond_to?(:empty?) ? !!obj.empty? : !obj
end

def starts_with?(str, prefix)
prefix.respond_to?(:to_str) && str[0, prefix.length] == prefix
end
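# Rough behaviour of the two helpers above (assumed, mirroring ActiveSupport):
#   blank?('')                                # => true
#   blank?([1, 2])                            # => false
#   starts_with?('array(varchar)', 'array(')  # => true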
Comment on lines +107 to +113 (Contributor Author): These are adopted from ActiveSupport.

end
end
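To make the new parser concrete, here is a short usage sketch (a reviewer illustration, not part of the diff; the column and values are hypothetical, built the same way the specs below build their fixtures):

```ruby
require 'trino-client'

# A hypothetical ROW-typed column as Trino would describe it.
column = Trino::Client::Models::Column.new(
  name: 'traits',
  type: 'row(breed varchar, num_spots integer)'
)

parser = Trino::Client::ColumnValueParser.new(column)

# ROW values arrive from Trino as positional arrays and come back as hashes.
parser.value(['spaniel', 2])
# => { 'breed' => 'spaniel', 'num_spots' => 2 }

# ARRAY values are recursed into element by element.
parser.value([['spaniel', 2]], 'array(row(breed varchar, num_spots integer))')
# => [{ 'breed' => 'spaniel', 'num_spots' => 2 }]
```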
28 changes: 28 additions & 0 deletions lib/trino/client/query.rb
@@ -18,6 +18,7 @@ module Trino::Client
require 'faraday'
require 'faraday/gzip'
require 'faraday/follow_redirects'
require 'trino/client/column_value_parser'
require 'trino/client/models'
require 'trino/client/errors'
require 'trino/client/faraday_client'
@@ -44,6 +45,19 @@ def self.faraday_client(options)
Trino::Client.faraday_client(options)
end

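# Public: Convert a raw result row (an array of values) into a hash keyed by
# column name, parsing each value with its matching ColumnValueParser.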
def self.transform_row(column_value_parsers, row)
row_object = {}

row.each_with_index do |element, i|
column = column_value_parsers[i]
value = column.value(element)

row_object[column.name] = value
end

row_object
end

def initialize(api)
@api = api
end
@@ -86,6 +100,20 @@ def columns
return @api.current_results.columns
end

def column_value_parsers
@column_value_parsers ||= columns.map {|column|
ColumnValueParser.new(column)
}
end

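# Public: Materialize every result row as a hash keyed by column name.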
def transform_rows
rows.map { |row| transform_row(row) }
end

def transform_row(row)
self.class.transform_row(column_value_parsers, row)
end

def rows
rows = []
each_row_chunk {|chunk|
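For context when reviewing, a rough end-to-end sketch of how the new Query methods could be called (the connection options and table are hypothetical):

```ruby
require 'trino-client'

# Hypothetical connection settings; adjust for your own Trino cluster.
client = Trino::Client.new(
  server: 'localhost:8080',
  catalog: 'memory',
  schema: 'default',
  user: 'reviewer'
)

client.query('SELECT name, traits FROM pets') do |query|
  # transform_rows returns one hash per row, keyed by column name, with ROW
  # columns parsed into nested hashes and ARRAY columns into Ruby arrays.
  query.transform_rows.each do |row|
    puts row['traits'] # e.g. {"breed" => "spaniel", "num_spots" => 2}
  end
end
```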
61 changes: 55 additions & 6 deletions spec/client_spec.rb
@@ -8,15 +8,17 @@
[
Models::Column.new(name: 'animal', type: 'string'),
Models::Column.new(name: 'score', type: 'integer'),
Models::Column.new(name: 'name', type: 'string')
Models::Column.new(name: 'name', type: 'string'),
Models::Column.new(name: 'foods', type: 'array(string string)'),
Models::Column.new(name: 'traits', type: 'row(breed string, num_spots integer)')
]
end

it 'multiple rows' do
rows = [
['dog', 1, 'Lassie'],
['horse', 5, 'Mr. Ed'],
['t-rex', 37, 'Doug']
['dog', 1, 'Lassie', ['kibble', 'peanut butter'], ['spaniel', 2]],
['horse', 5, 'Mr. Ed', ['hay', 'sugar cubes'], ['some horse', 0]],
['t-rex', 37, 'Doug', ['rodents', 'small dinos'], ['dino', 0]]
]
client.stub(:run).and_return([columns, rows])

@@ -27,18 +29,61 @@
expect(rehashed[0]['animal']).to eq 'dog'
expect(rehashed[0]['score']).to eq 1
expect(rehashed[0]['name']).to eq 'Lassie'
expect(rehashed[0]['foods']).to eq ['kibble', 'peanut butter']
expect(rehashed[0]['traits']).to eq ['spaniel', 2]

expect(rehashed[0].values[0]).to eq 'dog'
expect(rehashed[0].values[1]).to eq 1
expect(rehashed[0].values[2]).to eq 'Lassie'
expect(rehashed[0].values[3]).to eq ['kibble', 'peanut butter']
expect(rehashed[0].values[4]).to eq ['spaniel', 2]

expect(rehashed[1]['animal']).to eq 'horse'
expect(rehashed[1]['score']).to eq 5
expect(rehashed[1]['name']).to eq 'Mr. Ed'
expect(rehashed[1]['foods']).to eq ['hay', 'sugar cubes']
expect(rehashed[1]['traits']).to eq ['some horse', 0]

expect(rehashed[1].values[0]).to eq 'horse'
expect(rehashed[1].values[1]).to eq 5
expect(rehashed[1].values[2]).to eq 'Mr. Ed'
expect(rehashed[1].values[3]).to eq ['hay', 'sugar cubes']
expect(rehashed[1].values[4]).to eq ['some horse', 0]
end

it 'transforms rows into Ruby objects' do
rows = [
['dog', 1, 'Lassie', ['kibble', 'peanut butter'], ['spaniel', 2]],
['horse', 5, 'Mr. Ed', ['hay', 'sugar cubes'], ['some horse', 0]],
['t-rex', 37, 'Doug', ['rodents', 'small dinos'], ['dino', 0]]
]
client.stub(:run).and_return([columns, rows])

columns, rows = client.run('fake query')
column_value_parsers = columns.map { |column| Trino::Client::ColumnValueParser.new(column) }
transformed_rows = rows.map { |row| Trino::Client::Query.transform_row(column_value_parsers, row) }

expect(transformed_rows[0]).to eq({
"animal" => "dog",
"score" => 1,
"name" => "Lassie",
"foods" => ["kibble", "peanut butter"],
"traits" => {
"breed" => "spaniel",
"num_spots" => 2,
},
})

expect(transformed_rows[1]).to eq({
"animal" => "horse",
"score" => 5,
"name" => "Mr. Ed",
"foods" => ["hay", "sugar cubes"],
"traits" => {
"breed" => "some horse",
"num_spots" => 0,
},
})
end

it 'empty results' do
@@ -58,17 +103,21 @@
"animal" => "wrong",
"score" => "count",
"name" => nil,
"foods" => nil,
"traits" => nil
}]
end

it 'handles too many result columns' do
rows = [['wrong', 'count', 'too', 'much', 'columns']]
rows = [['wrong', 'count', 'too', 'too', 'too', 'much', 'columns']]
client.stub(:run).and_return([columns, rows])

expect(client.run_with_names('fake query')).to eq [{
"animal" => "wrong",
"score" => "count",
"name" => 'too',
"name" => "too",
"foods" => "too",
"traits" => "too"
}]
end
end
125 changes: 125 additions & 0 deletions spec/column_value_parser_spec.rb
@@ -0,0 +1,125 @@
require 'spec_helper'

describe Trino::Client::ColumnValueParser do
def column_value(data, type, scalar_parser = nil)
column = Struct.new(:type, :name).new(type)
Trino::Client::ColumnValueParser.new(column, scalar_parser).value(data)
end

it 'parses varchar values' do
data = 'a string'
type = 'varchar'
expected_value = 'a string'
expect(column_value(data, type)).to eq(expected_value)
end

it 'converts scalar values if configured to do so' do
data = '2022-07-01T14:53:02Z'
type = 'timestamp with time zone'
scalar_parser = ->(value, _dtype) { Time.parse(value) }
expected_value = Time.parse(data)
expect(column_value(data, type, scalar_parser)).to eq(expected_value)
end

it 'parses array type values' do
data = [1, 2, 3, 4]
type = 'array(integer, integer, integer, integer)'
expected_value = [1, 2, 3, 4]
expect(column_value(data, type)).to eq(expected_value)
end

it 'parses row type values' do
data = [
'userId',
'userLogin',
'SKU_FREE',
'TYPE_USER',
'2022-07-01T14:53:02Z',
''
]
type = 'row(id varchar, "name" varchar, plan_sku varchar, type varchar, created_at timestamp with time zone, organization_tenant_name varchar)'
expected_value = {
'id' => 'userId',
'name' => 'userLogin',
'plan_sku' => 'SKU_FREE',
'type' => 'TYPE_USER',
'created_at' => '2022-07-01T14:53:02Z',
'organization_tenant_name' => ''
}
expect(column_value(data, type)).to eq(expected_value)
end

it 'parses an array of row type values' do
data = [[
'userId',
'userLogin',
'SKU_FREE',
'TYPE_USER',
'2022-07-01T14:53:02Z',
''
]]
type = 'array(row(id varchar, "name" varchar, plan_sku varchar, type varchar, created_at timestamp with time zone, organization_tenant_name varchar))'
expected_value = [{
'id' => 'userId',
'name' => 'userLogin',
'plan_sku' => 'SKU_FREE',
'type' => 'TYPE_USER',
'created_at' => '2022-07-01T14:53:02Z',
'organization_tenant_name' => ''
}]
expect(column_value(data, type)).to eq(expected_value)
end

it 'parses row type values that have an array in them' do
data = [
'userId',
%w[userLogin1 userLogin2],
'value'
]
type = 'row(id varchar, logins array(varchar), onemore varchar)'
expected_value = {
'id' => 'userId',
'logins' => %w[userLogin1 userLogin2],
'onemore' => 'value'
}
expect(column_value(data, type)).to eq(expected_value)
end

it 'parses row type values that have a row in them' do
data = [
'userId',
['userLogin', '2022-07-01T14:53:02Z', 1234],
'value'
]
type = 'row(id varchar, subobj row(login varchar, created_at timestamp with time zone, id integer), onemore varchar)'
expected_value = {
'id' => 'userId',
'subobj' => {
'login' => 'userLogin',
'created_at' => '2022-07-01T14:53:02Z',
'id' => 1234
},
'onemore' => 'value'
}
expect(column_value(data, type)).to eq(expected_value)
end

it 'parses row type values that have nested rows in them' do
data = [
'userId',
['userLogin', '2022-07-01T14:53:02Z', [1234]],
'value'
]
type = 'row(id varchar, subobj row(login varchar, created_at timestamp with time zone, id row(subid integer)), onemore varchar)'
expected_value = {
'id' => 'userId',
'subobj' => {
'login' => 'userLogin',
'created_at' => '2022-07-01T14:53:02Z',
'id' => {'subid' => 1234}
},
'onemore' => 'value'
}
expect(column_value(data, type)).to eq(expected_value)
end
end