Skip to content

Commit

Permalink
Minor: Add Column Support in Protobuf Parser (#14745)
Browse files Browse the repository at this point in the history
  • Loading branch information
ulixius9 authored Jan 17, 2024
1 parent 8d053e6 commit dccfd9a
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 14 deletions.
40 changes: 29 additions & 11 deletions ingestion/src/metadata/parsers/protobuf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@
import traceback
from enum import Enum
from pathlib import Path
from typing import List, Optional
from typing import List, Optional, Union

import grpc_tools.protoc
from pydantic import BaseModel
from pydantic.main import ModelMetaclass

from metadata.generated.schema.type.schema import FieldModel
from metadata.generated.schema.entity.data.table import Column, DataType
from metadata.generated.schema.type.schema import DataTypeTopic, FieldModel
from metadata.utils.helpers import snake_to_camel
from metadata.utils.logger import ingestion_logger

Expand All @@ -37,12 +39,12 @@ class ProtobufDataTypes(Enum):
Enum for Protobuf Datatypes
"""

ERROR = 0
UNKNOWN = 0
DOUBLE = 1
FLOAT = 2
INT = 3, 4, 5, 13, 17, 18
FIXED = 6, 7, 15, 16
TYPE_BOOL = 8
BOOLEAN = 8
STRING = 9
UNION = 10
RECORD = 11
Expand Down Expand Up @@ -165,7 +167,9 @@ def get_protobuf_python_object(self, proto_path: str, file_path: str):
)
return None

def parse_protobuf_schema(self) -> Optional[List[FieldModel]]:
def parse_protobuf_schema(
self, cls: ModelMetaclass = FieldModel
) -> Optional[List[Union[FieldModel, Column]]]:
"""
Method to parse the protobuf schema
"""
Expand All @@ -177,10 +181,12 @@ def parse_protobuf_schema(self) -> Optional[List[FieldModel]]:
)

field_models = [
FieldModel(
cls(
name=instance.DESCRIPTOR.name,
dataType="RECORD",
children=self.get_protobuf_fields(instance.DESCRIPTOR.fields),
children=self.get_protobuf_fields(
instance.DESCRIPTOR.fields, cls=cls
),
)
]

Expand All @@ -196,7 +202,17 @@ def parse_protobuf_schema(self) -> Optional[List[FieldModel]]:
)
return None

def get_protobuf_fields(self, fields) -> Optional[List[FieldModel]]:
def _get_field_type(self, type_: int, cls: ModelMetaclass = FieldModel) -> str:
    """
    Method to resolve the data type name for a protobuf field type number

    Args:
        type_: numeric protobuf field type
        cls: model class the field is being built for (FieldModel or Column)

    Returns:
        DataType.UNKNOWN.value when type_ is greater than 18, otherwise the
        name of the matching ProtobufDataTypes member. When cls is Column
        and that name equals DataTypeTopic.FIXED.value, DataType.INT.value
        is returned instead.
    """
    # Type numbers above 18 are reported as unknown without an enum lookup
    if type_ > 18:
        return DataType.UNKNOWN.value

    type_name = ProtobufDataTypes(type_).name

    # NOTE(review): presumably Column's DataType has no FIXED member, hence
    # the fallback to INT for columns - confirm against the generated schema
    is_fixed_column = cls == Column and type_name == DataTypeTopic.FIXED.value
    return DataType.INT.value if is_fixed_column else type_name

def get_protobuf_fields(
self, fields, cls: ModelMetaclass = FieldModel
) -> Optional[List[Union[FieldModel, Column]]]:
"""
Recursively convert the parsed schema into required models
"""
Expand All @@ -205,10 +221,12 @@ def get_protobuf_fields(self, fields) -> Optional[List[FieldModel]]:
for field in fields:
try:
field_models.append(
FieldModel(
cls(
name=field.name,
dataType=ProtobufDataTypes(field.type).name,
children=self.get_protobuf_fields(field.message_type.fields)
dataType=self._get_field_type(field.type, cls=cls),
children=self.get_protobuf_fields(
field.message_type.fields, cls=cls
)
if field.type == 11
else None,
)
Expand Down
36 changes: 33 additions & 3 deletions ingestion/tests/unit/test_protobuf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from unittest import TestCase

from metadata.generated.schema.entity.data.table import Column
from metadata.parsers.protobuf_parser import ProtobufParser, ProtobufParserConfig


Expand All @@ -33,10 +34,21 @@ class ProtobufParserTests(TestCase):
F = 1; // female
O = 2; // other
}
message Result {
string url = 1;
string title = 2;
repeated string snippets = 3;
}
message PersonInfo {
int32 age = 1; // age in years
Gender gender = 2;
int32 height = 3; // height in cm
Result gender_new = 3;
int32 height = 4; // height in cm
fixed32 height_new = 5; // height in cm
bool my_bool = 6;
repeated string repeated_string = 7;
}
"""

Expand All @@ -57,10 +69,28 @@ def test_field_names(self):
field_names = {
str(field.name.__root__) for field in self.parsed_schema[0].children
}
self.assertEqual(field_names, {"height", "gender", "age"})
self.assertEqual(
field_names,
{
"height",
"gender",
"age",
"gender_new",
"height_new",
"my_bool",
"repeated_string",
},
)

def test_field_types(self):
field_types = {
str(field.dataType.name) for field in self.parsed_schema[0].children
}
self.assertEqual(field_types, {"INT", "ENUM"})
self.assertEqual(
field_types, {"INT", "ENUM", "RECORD", "FIXED", "STRING", "BOOLEAN"}
)

def test_column_types(self):
    """
    Parse the sample schema with cls=Column and check the set of data type
    names found on the children of the first parsed model. The expected
    set has no FIXED entry.
    """
    column_schema = self.protobuf_parser.parse_protobuf_schema(cls=Column)

    observed_types = set()
    for column in column_schema[0].children:
        observed_types.add(str(column.dataType.name))

    self.assertEqual(
        observed_types, {"INT", "ENUM", "RECORD", "STRING", "BOOLEAN"}
    )

0 comments on commit dccfd9a

Please sign in to comment.