48-interval-schema #50

Merged (3 commits) Jul 15, 2024

Changes from all commits
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
@@ -30,20 +30,20 @@ repos:
      - id: end-of-file-fixer
      - id: mixed-line-ending
    repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    rev: v4.6.0
  - repo: https://github.com/commitizen-tools/commitizen
    rev: v3.18.0 # automatically updated by Commitizen
    rev: v3.27.0 # automatically updated by Commitizen
    hooks:
      - id: commitizen
        stages: [commit-msg]
  - hooks:
      - id: flake8
    repo: https://github.com/pycqa/flake8
    rev: 7.0.0
    rev: 7.1.0
  - hooks:
      - id: black
    repo: https://github.com/psf/black
    rev: 24.2.0
    rev: 24.4.2
  - hooks:
      - args:
        - --profile
30 changes: 16 additions & 14 deletions distill/core/log.py
@@ -14,17 +14,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json

from pydantic import BaseModel
from pydantic.type_adapter import TypeAdapter
from typing import Dict, Union

from pksuid import PKSUID
from pydantic import BaseModel, parse_obj_as
from pydantic.type_adapter import TypeAdapter

from distill.core.types import JsonDict, JSONSerializable
from distill.schemas.userale import UserAleSchema
from distill.core.types import JsonDict, JSONSerializable, UserAleSchema

ta = TypeAdapter(JsonDict)


class Log:
    """
    Base class for log object representation.
@@ -34,27 +34,29 @@ class Log:
    defaults to UserAle log schema
    """

    def __init__(self, data: Union[str, JsonDict], schema=UserAleSchema):
        if not issubclass(schema, BaseModel):
    def __init__(self, data: Union[str, JsonDict], schema=None):
        if schema is None:
            schema = UserAleSchema
        elif not issubclass(schema, BaseModel):
            raise TypeError("schema should inherit from pydantic.BaseModel")

        if isinstance(data, str):
            schema.model_validate_json(data, strict=True)
            hash_sfx = str(hash(data))
            data = json.loads(data)
        elif ta.validate_python(data):
            hash_sfx = str(hash(json.dumps(data)))
            schema.model_validate(data, strict=True)
        else:
            raise TypeError("ERROR: " + str(type(data)) + " data should be either a string or a JsonDict")
        self.data = schema(**data)

        self.id = PKSUID("log_" + hash_sfx, schema._timestamp(self.data))
            raise TypeError(
                "ERROR: "
                + str(type(data))
                + " data should be either a string or a JsonDict"
            )
        self.data = schema.validate_python(data)

        self.id = PKSUID("log_" + hash_sfx, self.data._timestamp())

    def to_json(self) -> str:
        return self.data.model_dump_json(by_alias=True)

    def to_dict(self) -> JsonDict:
        return self.data.model_dump(by_alias=True)
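
The upshot of this file's changes: Log no longer instantiates a single BaseModel subclass; it hands validation to whatever UserAleSchema resolves to, which (per the distill/core/types.py change below) is a Pydantic TypeAdapter over a union of the raw and interval models. A minimal sketch of that dispatch pattern, using hypothetical stand-in models rather than the real UserAle schemas:

from typing import Literal, Union

from pydantic import BaseModel
from pydantic.type_adapter import TypeAdapter


class RawStub(BaseModel):
    log_type: Literal["raw"]
    client_time: int


class IntervalStub(BaseModel):
    log_type: Literal["interval"]
    start_time: int
    end_time: int


# A TypeAdapter over a union returns an instance of whichever member
# model the input data validates against.
StubSchema = TypeAdapter(Union[RawStub, IntervalStub])

log = StubSchema.validate_python(
    {"log_type": "interval", "start_time": 1708447014463, "end_time": 1708447014592}
)
assert isinstance(log, IntervalStub)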

25 changes: 15 additions & 10 deletions distill/core/types.py
@@ -1,22 +1,27 @@
from typing import Union, List, Dict
from typing import Dict, List, Union

from pydantic.type_adapter import TypeAdapter
from typing_extensions import TypeAliasType

from distill.schemas.userale import UserAleRawSchema, UserAleIntervalSchema

# TypeAliasType is necessary to avoid recursion error when validating this
# type with Pydantic
JSONSerializable = TypeAliasType(
    "JSONSerializable",
    Union[str,
          int,
          float,
          bool,
          None,
          List['JSONSerializable'],
          Dict[str, 'JSONSerializable']
    Union[
        str,
        int,
        float,
        bool,
        None,
        List["JSONSerializable"],
        Dict[str, "JSONSerializable"],
    ],
)

JsonDict = Dict[str, 'JSONSerializable']
JsonDict = Dict[str, "JSONSerializable"]

Timestamp = Union[str, int, float]


UserAleSchema = TypeAdapter(Union[UserAleRawSchema, UserAleIntervalSchema])
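
As the comment above notes, TypeAliasType (from typing_extensions) is what lets the self-referential JSONSerializable alias build without a recursion error when Pydantic constructs its core schema. A small sketch of the validation behavior this enables, assuming the distill package from this diff is importable:

from pydantic import ValidationError
from pydantic.type_adapter import TypeAdapter

from distill.core.types import JsonDict

adapter = TypeAdapter(JsonDict)

# Arbitrarily nested JSON-style structures validate...
adapter.validate_python({"a": [1, 2.5, None, {"b": ["c", True]}]})

# ...while values that are not JSON-serializable are rejected.
try:
    adapter.validate_python({"bad": object()})
except ValidationError:
    print("rejected non-JSON value")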
89 changes: 69 additions & 20 deletions distill/schemas/userale.py
@@ -13,22 +13,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional
from datetime import datetime
from typing import List, Optional

from pydantic import AliasGenerator, BaseModel, Field, field_serializer, field_validator
from pydantic import BaseModel, Field, AliasGenerator, field_serializer, field_validator
from pydantic.alias_generators import to_camel
from pydantic.config import ConfigDict

from .base import BaseSchema
from datetime import datetime

from distill.schemas.base import BaseSchema

class Browser(BaseModel):
    browser: str
    version: str


class Location(BaseModel):
    x: Optional[int]
    y: Optional[int]
@@ -42,38 +42,90 @@ class ScrnRes(BaseModel):
class Details(BaseModel):
    window: bool


class UserAleSchema(BaseSchema):
class UserAleBaseSchema(BaseSchema):
    """
    Base schema for raw, interval, or custom logs produced by UserAle
    """

    model_config = ConfigDict(
        title="Log",
        alias_generator=AliasGenerator(
            validation_alias=to_camel, serialization_alias=to_camel
        ),
    )

    target: str
    path: List[str]
    page_url: str
    page_title: str
    page_referrer: str
    browser: Browser
    client_time: int
    micro_time: int = Field(..., lt=2)
    location: Location
    scrn_res: ScrnRes
    type_field: str = Field(..., validation_alias="type", serialization_alias="type")
    log_type: str
    user_action: bool
    details: Details
    user_id: str
    tool_version: Optional[str]
    tool_name: Optional[str]
    userale_version: Optional[str]
    session_id: str
    http_session_id: str
    browser_session_id: str

    def _timestamp(self):
        """
        Returns:
            float: POSIX time from the log's timestamp field
        """
        pass


class UserAleIntervalSchema(UserAleBaseSchema):
    """
    An interval log produced by UserAle
    """

    model_config = ConfigDict(
        title="IntervalLog",
        alias_generator=AliasGenerator(
            validation_alias=to_camel, serialization_alias=to_camel
        ),
    )

    count: int
    duration: int
    start_time: int
    end_time: int
    target_change: bool
    type_change: bool

    @field_validator("start_time", "end_time")
    def validate_st(cls, st: float):
        return datetime.fromtimestamp(st / 1000)

    @field_serializer("start_time", "end_time")
    def serialize_st(self, st: datetime):
        return int(st.timestamp() * 1000)

    def _timestamp(self):
        """
        Returns:
            float: POSIX time from userALE log's start_time field
        """
        return self.start_time.timestamp()


class UserAleRawSchema(UserAleBaseSchema):
    """
    A raw or custom log produced by UserAle
    """

    model_config = ConfigDict(
        title="RawLog",
        alias_generator=AliasGenerator(
            validation_alias=to_camel, serialization_alias=to_camel
        ),
    )

    client_time: int
    micro_time: int = Field(..., lt=2)
    location: Location
    scrn_res: ScrnRes
    details: Details

    @field_validator("client_time")
    def validate_ct(cls, ct: float):
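
The paired field_validator/field_serializer on start_time and end_time parse millisecond epoch integers into datetime objects on input and emit millisecond integers again on output, which is what lets _timestamp() simply call .timestamp(). A minimal sketch of that round trip with a stand-in model (not the full interval schema):

from datetime import datetime

from pydantic import BaseModel, field_serializer, field_validator


class IntervalTimesStub(BaseModel):
    start_time: datetime
    end_time: datetime

    @field_validator("start_time", "end_time", mode="before")
    def parse_ms(cls, v):
        # Incoming values are epoch milliseconds; convert to datetime.
        return datetime.fromtimestamp(v / 1000)

    @field_serializer("start_time", "end_time")
    def dump_ms(self, dt: datetime):
        # Serialize back to epoch milliseconds.
        return int(dt.timestamp() * 1000)


stub = IntervalTimesStub(start_time=1708447014463, end_time=1708447014592)
print(stub.start_time)                  # parsed datetime
print(stub.model_dump()["start_time"])  # epoch milliseconds again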
1 change: 1 addition & 0 deletions tests/data/log_interval_data.json
@@ -0,0 +1 @@
{"target": "nav.navigation-bar-desktop", "path": ["nav.navigation-bar-desktop","body.body","html.no-js","#document","Window"], "pageUrl": "https://beam.apache.org/case-studies/", "pageTitle": "Case Studies", "pageReferrer": "https://beam.apache.org/", "browser": {"browser": "chrome","version": "114.0.0"}, "count": 1, "duration": 129, "startTime": 1708447014463, "endTime": 1708447014592, "type": "mouseover", "logType": "interval", "targetChange": true, "typeChange": false, "userAction": false, "userId": "MD", "toolVersion": "", "toolName": "", "useraleVersion": "2.4.0", "sessionId": "session_1708446947239", "httpSessionId": "72798a8ad776417183b1aa14e03c3132", "browserSessionId": "06b0db1ab30e8e92819ba3d4091b83bc"}
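
This sample line has exactly the shape the new UserAleIntervalSchema expects: camelCase keys on the wire, mapped to snake_case fields by the to_camel alias generator configured on the models. A tiny sketch of that aliasing behavior with a hypothetical stand-in model:

from pydantic import AliasGenerator, BaseModel
from pydantic.alias_generators import to_camel
from pydantic.config import ConfigDict


class AliasStub(BaseModel):
    model_config = ConfigDict(
        alias_generator=AliasGenerator(
            validation_alias=to_camel, serialization_alias=to_camel
        )
    )

    page_url: str
    log_type: str


stub = AliasStub.model_validate({"pageUrl": "https://example.com", "logType": "interval"})
print(stub.page_url)                   # snake_case attribute access
print(stub.model_dump(by_alias=True))  # camelCase keys on output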
2 changes: 1 addition & 1 deletion tests/data/log_test_data.json
@@ -1 +1 @@
{"target": "#document","path": [ "Window" ], "pageUrl": "https://github.com/apache/flagon/tree/master/docker", "pageTitle": "flagon/docker at master · apache/flagon · GitHub", "pageReferrer": "https://gov.teams.microsoft.us/", "browser": { "browser": "chrome", "version": "116.0.0" }, "clientTime": 1719530111079, "microTime": 0,"location": { "x": null, "y": null }, "scrnRes": { "width": 1349, "height":954 }, "type": "load", "logType": "raw", "userAction": true, "details": {"window": true }, "userId": "nobody", "toolVersion": null, "toolName":"test_app", "useraleVersion": "2.3.0", "sessionId":"session_1719530074303"}
{"target": "#document","path": [ "Window" ], "pageUrl": "https://github.com/apache/flagon/tree/master/docker", "pageTitle": "flagon/docker at master · apache/flagon · GitHub", "pageReferrer": "https://gov.teams.microsoft.us/", "browser": { "browser": "chrome", "version": "116.0.0" }, "clientTime": 1719530111079, "microTime": 0,"location": { "x": null, "y": null }, "scrnRes": { "width": 1349, "height":954 }, "type": "load", "logType": "raw", "userAction": true, "details": {"window": true }, "userId": "nobody", "toolVersion": null, "toolName":"test_app", "useraleVersion": "2.3.0", "sessionId":"session_1719530074303", "httpSessionId": "72798a8ad776417183b1aa14e03c3132", "browserSessionId": "06b0db1ab30e8e92819ba3d4091b83bc"}
30 changes: 23 additions & 7 deletions tests/test_log.py
@@ -16,18 +16,18 @@

import json
import os
from datetime import datetime

from pydantic import ValidationError

from distill.core.log import Log
from tests.data_config import DATA_DIR
from datetime import datetime


def test_log_constructor():
    exception_thrown = False
    try:
        _ = Log(data="garbage data")
        _ = Log(data='{"garbage data": "bad"}')
    except ValidationError:
        exception_thrown = True
    assert exception_thrown == True
@@ -48,14 +48,24 @@ def test_log_constructor():
    assert id.get_timestamp() == 1719530111079 // 1000
    assert id.prefix.startswith("log_")

    data = load_interval_log()
    test_interval_log = Log(data=data)
    assert test_interval_log is not None
    id = test_interval_log.id
    assert id.get_timestamp() == 1708447014463 // 1000


def test_log_serialize():
    data = load_log()
    test_log = Log(data=data)

    correct_str = json.dumps(
        json.loads(data), separators=(",", ":"), ensure_ascii=False
    )
    # correct_str = json.dumps(
    #     json.loads(data), separators=(",", ":"), ensure_ascii=False
    # )
    # Hardcoding this for now because creating a polymorphic model does not
    # preserve order in pydantic. Our data is still correct but not in the
    # original order. There doesn't seem to be an easy way to fix this right now
    correct_str = '{"target":"#document","path":["Window"],"pageUrl":"https://github.com/apache/flagon/tree/master/docker","pageTitle":"flagon/docker at master · apache/flagon · GitHub","pageReferrer":"https://gov.teams.microsoft.us/","browser":{"browser":"chrome","version":"116.0.0"},"type":"load","logType":"raw","userAction":true,"userId":"nobody","toolVersion":null,"toolName":"test_app","useraleVersion":"2.3.0","sessionId":"session_1719530074303","httpSessionId":"72798a8ad776417183b1aa14e03c3132","browserSessionId":"06b0db1ab30e8e92819ba3d4091b83bc","clientTime":1719530111079,"microTime":0,"location":{"x":null,"y":null},"scrnRes":{"width":1349,"height":954},"details":{"window":true}}'
    serialized_data = test_log.to_json()
    assert serialized_data == correct_str

@@ -73,7 +73,7 @@ def test_log_normalize_timestamp():
    data = load_log()
    test_log = Log(data=data)

    # note: the provided UserAle log has clientTime in milliseconds, but we
    # need it in seconds to be able to parse
    correct_ms = 1719530111079
    correct_dt = datetime.fromtimestamp(correct_ms / 1000)
@@ -86,3 +96,9 @@ def load_log() -> str:
    with open(os.path.join(DATA_DIR, "log_test_data.json")) as f:
        data = f.readline()
    return data


def load_interval_log() -> str:
    with open(os.path.join(DATA_DIR, "log_interval_data.json")) as f:
        data = f.readline()
    return data
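
A usage sketch tying the new tests together, assuming the package layout and data files in this diff: the same Log constructor now handles either log flavor without an explicit schema argument.

import os

from distill.core.log import Log
from tests.data_config import DATA_DIR

with open(os.path.join(DATA_DIR, "log_interval_data.json")) as f:
    interval_log = Log(data=f.readline())

# The data validated against UserAleIntervalSchema, so the PKSUID id's
# timestamp is derived from startTime (milliseconds -> seconds).
print(interval_log.to_dict()["logType"])  # "interval"
print(interval_log.id.get_timestamp())    # 1708447014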