Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Surface concept schema changes via sdk query updates #126

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
bb93851
Update vespa test files
THOR300 Sep 26, 2024
9d341ef
Updating test data to contain indent.
Sep 30, 2024
6977417
Updating tests data to contain concepts.
Sep 30, 2024
2aabedf
Adding test query for the concepts data.
Sep 30, 2024
c5ad5b7
Bumping the version.
Sep 30, 2024
71db3c4
Marking test as a vespa test.
Sep 30, 2024
0818e8b
Update vespa test files
THOR300 Sep 30, 2024
4d1a0a7
Bumping the version.
Sep 30, 2024
9199af2
Merging in the latest schema changes.
Sep 30, 2024
adf8972
Merge branch 'feature/update-test-data-to-include-kg-data' into featu…
Sep 30, 2024
131d4cc
Updating the test queries.
Sep 30, 2024
08014a5
Update vespa test files
THOR300 Oct 3, 2024
cbbea27
Updating the latest schema and test data.
Oct 3, 2024
9b7d411
Merge branch 'feature/update-test-data-to-include-kg-data' into featu…
Oct 3, 2024
c990ed4
Updating the vespa query tests.
Oct 3, 2024
ce25302
Updating the tests and data.
Oct 3, 2024
f69b610
Precommit fix.
Oct 3, 2024
ed551bb
Adding a failing test and the ConceptFilter object.
Oct 3, 2024
2030089
Pre-commit fix.
Oct 7, 2024
4450bef
Clean up.
Oct 7, 2024
3e87a32
Updating the concept filters test and related code.
Oct 7, 2024
fbf4adb
Refactoring the search query test.
Oct 7, 2024
74d39c1
Refactoring the search query test.
Oct 7, 2024
36c8a72
Typo.
Oct 7, 2024
f4d7f99
Merging in the latest changes from main.
Oct 7, 2024
ca7521b
Adding the correct version bump.
Oct 7, 2024
e7c74c1
Pre-commit fix.
Oct 7, 2024
e03c5e1
Adding conditional to assertion.
Oct 7, 2024
ee8f83e
Incrementing the version.
Oct 7, 2024
18d8308
refactoring.
Oct 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 79 additions & 1 deletion src/cpr_sdk/models/search.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from datetime import datetime
from typing import List, Optional, Sequence
from typing import List, Literal, Optional, Sequence

from pydantic import (
AliasChoices,
Expand Down Expand Up @@ -45,6 +45,75 @@ class MetadataFilter(BaseModel):
value: str


class ConceptFilter(BaseModel):
    """
    A filter on one sub-field of the concepts attached to a document passage.

    `name` selects which concept sub-field to filter on and `value` is the
    string to look for in that sub-field.
    """

    name: Literal["name", "id", "model", "timestamp", "parent_concept_ids_flat"]
    value: str

    @model_validator(mode="after")
    def validate_parent_concept_ids_flat(self) -> "ConceptFilter":
        """
        Normalise the value of a parent_concept_ids_flat filter.

        In the schema the parent_concept_ids_flat field holds comma separated
        values. Querying it for "Q1" without a delimiter would also match
        entries such as "Q12" or "Q123", which is not what the caller asked
        for. To avoid this the value is always queried with a trailing comma
        ("Q1,"), so a comma is appended when the caller did not supply one.

        Filters on any other field are returned untouched.

        Raises:
            ValueError: if the filter targets parent_concept_ids_flat and the
                value is empty. Pydantic reports this as a validation error;
                indexing the last character of an empty string would otherwise
                raise a bare IndexError.
        """
        if self.name != "parent_concept_ids_flat":
            return self

        if not self.value:
            raise ValueError(
                "value must not be empty when filtering on parent_concept_ids_flat"
            )

        if not self.value.endswith(","):
            self.value = self.value + ","
        return self


class Concept(BaseModel):
    """
    A concept extracted from a passage of text.

    This refers to a span of text within passage that holds a concept.
    E.g. "Adaptation strategy" is a concept within a passage starting at index 0 and
    ending at index 17, classified by model "environment_model_1" on the 12th Jan at
    12:00.
    """

    id: str
    name: str
    parent_concepts: List[dict[str, str]]
    parent_concept_ids_flat: str
    model: str
    end: int
    start: int
    timestamp: datetime

    @model_validator(mode="after")
    def validate_parent_concept_ids_flat(self) -> "Concept":
        """
        Check that parent_concept_ids_flat agrees with parent_concepts.

        The expected value is the "name" entry of every parent concept joined
        with commas and terminated by a trailing comma (e.g. "a,b,"). When
        there are no parent concepts the expected value is the empty string.

        Raises:
            ValueError: if parent_concept_ids_flat differs from the expected
                value.
        """
        # NOTE(review): the field is called *_ids_flat but is compared against
        # the parent concepts' "name" entries - confirm this is intended.
        expected_flat = ",".join(
            parent_concept["name"] for parent_concept in self.parent_concepts
        )

        # Only add the terminator to a non-empty string: with no parent
        # concepts the join yields "", and indexing its last character would
        # raise IndexError instead of a validation error.
        if expected_flat and not expected_flat.endswith(","):
            expected_flat += ","

        if self.parent_concept_ids_flat != expected_flat:
            raise ValueError(
                "parent_concept_ids_flat must be a comma separated list of parent "
                "concept names. "
                f"Received parent_concept_ids_flat data: {self.parent_concept_ids_flat},"
                f"received parent_concept names: {expected_flat}"
            )
        return self


class Filters(BaseModel):
"""Filterable fields in a search request"""

Expand Down Expand Up @@ -171,6 +240,11 @@ class SearchParameters(BaseModel):
E.g. [{"name": "family.sector", "value": "Price"}]
"""

concept_filters: Optional[Sequence[ConceptFilter]] = None
"""
A field and item mapping to search in the concepts field of the document passages.
"""

@model_validator(mode="after")
def validate(self):
"""Validate against mutually exclusive fields"""
Expand Down Expand Up @@ -291,6 +365,7 @@ class Hit(BaseModel):
corpus_type_name: Optional[str] = None
corpus_import_id: Optional[str] = None
metadata: Optional[Sequence[dict[str, str]]] = None
concepts: Optional[Sequence[Concept]] = None

@classmethod
def from_vespa_response(cls, response_hit: dict) -> "Hit":
Expand Down Expand Up @@ -354,6 +429,7 @@ def from_vespa_response(cls, response_hit: dict) -> "Document":
corpus_type_name=fields.get("corpus_type_name"),
corpus_import_id=fields.get("corpus_import_id"),
metadata=fields.get("metadata"),
concepts=fields.get("concepts"),
)


Expand Down Expand Up @@ -381,6 +457,7 @@ def from_vespa_response(cls, response_hit: dict) -> "Passage":
if family_publication_ts
else None
)

return cls(
family_name=fields.get("family_name"),
family_description=fields.get("family_description"),
Expand All @@ -405,6 +482,7 @@ def from_vespa_response(cls, response_hit: dict) -> "Passage":
text_block_page=fields.get("text_block_page"),
text_block_coords=fields.get("text_block_coords"),
metadata=fields.get("metadata"),
concepts=fields.get("concepts"),
)


Expand Down
2 changes: 1 addition & 1 deletion src/cpr_sdk/version.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
_MAJOR = "1"
_MINOR = "8"
_MINOR = "9"
_PATCH = "0"
_SUFFIX = ""

Expand Down
13 changes: 13 additions & 0 deletions src/cpr_sdk/yql_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,18 @@ def build_metadata_filter(self) -> Optional[str]:
return f"({' and '.join(metadata_filters)})"
return None

def build_concepts_filter(self) -> Optional[str]:
    """
    Build the where-clause fragment that restricts results by concept.

    Every configured concept filter becomes one condition inside a single
    `sameElement(...)` expression on the `concepts` field. Filters on
    `parent_concept_ids_flat` use the `matches` operator; all other filters
    use `contains`.

    Returns:
        The parenthesised fragment, or None when no concept filters are set.
    """
    concept_filters = self.params.concept_filters
    if not concept_filters:
        return None

    # NOTE(review): names and values are interpolated into the query as-is;
    # confirm that quoting/escaping of values is handled before this point.
    conditions = [
        (
            f"{item.name} matches '{item.value}'"
            if item.name == "parent_concept_ids_flat"
            else f"{item.name} contains '{item.value}'"
        )
        for item in concept_filters
    ]
    return f"(concepts contains sameElement({', '.join(conditions)}))"

def build_corpus_type_name_filter(self) -> Optional[str]:
"""Create the part of the query that limits to specific corpora"""
if self.params.corpus_type_names:
Expand Down Expand Up @@ -163,6 +175,7 @@ def build_where_clause(self) -> str:
filters.append(self.build_corpus_type_name_filter())
filters.append(self.build_corpus_import_ids_filter())
filters.append(self.build_metadata_filter())
filters.append(self.build_concepts_filter())
if f := self.params.filters:
filters.append(self._inclusive_filters(f, "family_geographies"))
filters.append(self._inclusive_filters(f, "family_geography"))
Expand Down
5 changes: 2 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import json
from pathlib import Path
import tempfile
from pathlib import Path

import pytest
import boto3
import pytest
from moto import mock_aws

from cpr_sdk.search_adaptors import VespaSearchAdapter


VESPA_TEST_SEARCH_URL = "http://localhost:8080"


Expand Down
37 changes: 37 additions & 0 deletions tests/local_vespa/test_app/schemas/document_passage.sd
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,42 @@ schema document_passage {
}
}
}

# A parent of a concept, described by two string fields: id and name.
# Used as the element type of the parent_concepts array in the concept struct.
struct parent_concept {
field id type string {}
field name type string {}
}

# One concept entry stored on a document passage.
# Used as the element type of the concepts array field.
struct concept {
field name type string {}
field id type string {}
# Parents as structured (id, name) pairs.
field parent_concepts type array<parent_concept> {}
# Parents again, as one plain string.
# NOTE(review): presumably comma separated with a trailing comma, matching
# the SDK's ConceptFilter/Concept validators - confirm against the feed data.
field parent_concept_ids_flat type string {}
# NOTE(review): start/end look like offsets of the concept span within the
# passage text - confirm the unit and whether end is inclusive.
field start type int {}
field end type int {}
field model type string {}
# Declared as a string in this schema, not a numeric or date type.
field timestamp type string {}
}

# The concepts attached to this passage, as an array of concept structs.
# The array itself is declared with `indexing: summary`.
field concepts type array<concept> {
indexing: summary

# The struct-fields below are each declared with `indexing: attribute`.
# parent_concepts, start and end have no struct-field entry here.
struct-field name {
indexing: attribute
}
struct-field id {
indexing: attribute
}
struct-field parent_concept_ids_flat {
indexing: attribute
}
struct-field model {
indexing: attribute
}
struct-field timestamp {
indexing: attribute
}
}
}

import field family_document_ref.family_name as family_name {}
Expand Down Expand Up @@ -95,6 +131,7 @@ schema document_passage {
summary text_block_type {}
summary text_block_page {}
summary text_block_coords {}
summary concepts {}
}

rank-profile exact inherits default {
Expand Down
Loading
Loading