Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
johnfraney committed Jan 14, 2025
1 parent f0261a4 commit 0d2a75c
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 41 deletions.
161 changes: 140 additions & 21 deletions blurry/schema_validation.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,167 @@
import importlib
from collections.abc import MutableMapping
from datetime import datetime
from pathlib import Path
from typing import Literal

from pydantic.v1 import ValidationError
from pydantic import BaseModel, Field, ValidationError
from rich.console import Console

from blurry.settings import SETTINGS


# Schema.org-style Boolean expressed as the literal strings "True" / "False".
# NOTE(review): front matter that supplies a native boolean (e.g. TOML `true`)
# will presumably not match these string literals — confirm this is intended.
Boolean = Literal["True"] | Literal["False"]


class Thing(BaseModel):
    """Root model for the partial Schema.org types declared in this module.

    Every other model here derives from it, directly or through
    ``CreativeWork``. All fields are optional and default to ``None``.
    """

    additionalType: str | None = None
    alternateName: str | None = None
    description: str | None = None
    # Quoted forward reference: ImageObject is declared later in the module.
    image: "ImageObject | None" = None
    name: str | None = None


class Person(Thing):
    """Partial Schema.org ``Person``: name parts, contact details and organization links.

    All fields besides ``type_`` are optional and default to ``None``.
    """

    # Populated from the "@type" key through the alias; defaults to "Person"
    # and is declared frozen.
    type_: str = Field(default="Person", alias="@type", frozen=True)
    additionalName: str | None = None
    # Quoted forward reference: Organization is declared after this class.
    affiliation: "Organization | None" = None
    email: str | None = None
    familyName: str | None = None
    givenName: str | None = None
    honorificPrefix: str | None = None
    honorificSuffix: str | None = None
    telephone: str | None = None
    # Quoted forward reference: Organization is declared after this class.
    worksFor: "Organization | None" = None


class Organization(Thing):
    """Partial Schema.org ``Organization``; adds no fields beyond those of ``Thing``."""

    # Populated from the "@type" key through the alias; defaults to
    # "Organization" and is declared frozen.
    type_: str = Field(default="Organization", alias="@type", frozen=True)


class CreativeWork(Thing):
    """Partial Schema.org ``CreativeWork``: authorship, dates and headline.

    Base class for the media, page and article models below. All fields
    besides ``type_`` are optional and default to ``None``.
    """

    # Populated from the "@type" key through the alias; defaults to
    # "CreativeWork" and is declared frozen.
    type_: str = Field(default="CreativeWork", alias="@type", frozen=True)
    abstract: str | None = None
    # Author and creator may each be either a Person or an Organization.
    author: Person | Organization | None = None
    creator: Person | Organization | None = None
    dateCreated: datetime | None = None
    dateModified: datetime | None = None
    datePublished: datetime | None = None
    headline: str | None = None


class MediaObject(CreativeWork):
    """Partial Schema.org ``MediaObject``: size, bitrate and location of a media file.

    All fields besides ``type_`` are optional and default to ``None``.
    """

    # Populated from the "@type" key through the alias; defaults to
    # "MediaObject" and is declared frozen.
    type_: str = Field(default="MediaObject", alias="@type", frozen=True)
    # Quoted forward reference: NewsArticle is declared later in the module.
    associatedArticle: "NewsArticle | None" = None
    bitrate: str | None = None
    contentSize: str | None = None
    contentUrl: str | None = None
    url: str | None = None
    embedUrl: str | None = None


class ImageObject(MediaObject):
    """Partial Schema.org ``ImageObject``: caption and page-representation details."""

    # Populated from the "@type" key through the alias; defaults to
    # "ImageObject" and is declared frozen.
    type_: str = Field(default="ImageObject", alias="@type", frozen=True)
    # A caption may be a nested MediaObject or a plain string.
    caption: MediaObject | str | None = None
    embeddedTextCaption: str | None = None
    # Not modelled yet:
    # exifData: PropertyValue | Text | None
    # Uses the module-level Boolean alias (the literal strings "True"/"False").
    representativeOfPage: Boolean | None = None


class WebSite(CreativeWork):
    """Partial Schema.org ``WebSite``; adds an optional ``issn`` to ``CreativeWork``."""

    # Populated from the "@type" key through the alias; defaults to
    # "WebSite" and is declared frozen.
    type_: str = Field(default="WebSite", alias="@type", frozen=True)
    issn: str | None = None


class WebPage(CreativeWork):
    """Partial Schema.org ``WebPage``: review info, primary image and related links.

    Properties that appear only as comments below are not modelled yet.
    All declared fields besides ``type_`` are optional and default to ``None``.
    """

    # Populated from the "@type" key through the alias; defaults to
    # "WebPage" and is declared frozen.
    type_: str = Field(default="WebPage", alias="@type", frozen=True)
    # breadcrumb: BreadcrumbList | Text | None
    lastReviewed: datetime | None = None
    # mainContentOfPage: WebPageElement | None
    primaryImageOfPage: ImageObject | None = None
    relatedLink: str | None = None
    # The reviewer may be either an Organization or a Person.
    reviewedBy: Organization | Person | None = None
    significantLink: str | None = None
    # speakable: SpeakableSpecification | None
    # specialty: Specialty | None


class Article(CreativeWork):
    """Partial Schema.org ``Article`` with stricter fields than ``CreativeWork``.

    ``author``, ``datePublished``, ``headline`` and ``image`` are re-declared
    without defaults and without ``None`` in their types, so they are required
    here. ``image`` additionally accepts a plain string or a list of
    images/strings.
    """

    # Populated from the "@type" key through the alias; defaults to
    # "Article" and is declared frozen.
    type_: str = Field(default="Article", alias="@type", frozen=True)
    author: Person | Organization  # pyright: ignore to narrow the type
    datePublished: datetime  # pyright: ignore to narrow the type
    headline: str  # pyright: ignore to narrow the type
    image: ImageObject | str | list[ImageObject | str]  # pyright: ignore to permit Google's recommendation


class NewsArticle(Article):
    """Partial Schema.org ``NewsArticle``: ``Article`` plus optional print-edition details."""

    # Populated from the "@type" key through the alias; defaults to
    # "NewsArticle" and is declared frozen.
    type_: str = Field(default="NewsArticle", alias="@type", frozen=True)
    dateline: str | None = None
    printColumn: str | None = None
    printEdition: str | None = None
    printPage: str | None = None
    printSection: str | None = None


class SocialMediaPosting(Article):
    """Partial Schema.org ``SocialMediaPosting``: ``Article`` plus optional shared content."""

    # Populated from the "@type" key through the alias; defaults to
    # "SocialMediaPosting" and is declared frozen.
    type_: str = Field(default="SocialMediaPosting", alias="@type", frozen=True)
    sharedContent: CreativeWork | None = None


class BlogPosting(SocialMediaPosting):
    """Partial Schema.org ``BlogPosting``; adds no fields beyond ``SocialMediaPosting``."""

    # Populated from the "@type" key through the alias; defaults to
    # "BlogPosting" and is declared frozen.
    type_: str = Field(default="BlogPosting", alias="@type", frozen=True)


# class Review(BaseModel):
# author: Person | Organization
# itemReviewed: (
# Book
# | Course
# | CreativeWorkSeason
# | CreativeWorkSeries
# | Episode
# | Event
# | Game
# | HowTo
# | LocalBusiness
# | MediaObject
# | Movie
# | MusicPlaylist
# | MusicRecording
# | Organization
# | Product
# | Recipe
# | SoftwareApplication
# )


def validate_front_matter_as_schema(
    path: Path, schema_variables: MutableMapping, console: Console
):
    """
    Validates schema data using partial Schema.org types based on Google's support for them:
    https://developers.google.com/search/docs/appearance/structured-data/search-gallery

    The "@type" value is first mapped through SETTINGS["TEMPLATE_SCHEMA_TYPES"]
    and then resolved to a pydantic model class defined in this module.
    Problems are reported through ``console.print`` instead of being raised.

    Args:
        path: File the front matter came from; used only as a message prefix.
        schema_variables: Front matter mapping; must contain an "@type" key.
        console: Console that receives one printed line per problem found.

    Raises:
        KeyError: If ``schema_variables`` has no "@type" entry.
    """
    schema_type = schema_variables["@type"]

    if mapped_schema_type := SETTINGS["TEMPLATE_SCHEMA_TYPES"].get(schema_type):
        schema_type = mapped_schema_type

    # Resolve the model from this module's own namespace. Only real BaseModel
    # subclasses are accepted, so an unmodelled type (or a type name that
    # happens to match some other module attribute) is skipped with a message
    # instead of raising AttributeError/TypeError.
    SchemaModel = globals().get(schema_type)
    if not (
        isinstance(SchemaModel, type)
        and issubclass(SchemaModel, BaseModel)
        and SchemaModel is not BaseModel
    ):
        console.print(
            f"{path}: Could not find Schema type for {schema_type}. Skipping."
        )
        return

    # Validate model and print errors
    try:
        SchemaModel(**schema_variables)
    except ValidationError as e:
        for error in e.errors():
            msg = error["msg"]
            loc = error["loc"]
            # Unwrap single-element locations so the bare field name is printed
            if len(loc) == 1:
                loc = loc[0]
            console.print(f"{path}: {schema_type} schema validation error: {msg}: {loc}")
22 changes: 4 additions & 18 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@ rich = "^13.9.4"
selectolax = "^0.3.27"
typer = "^0.15.1"
htmlmin2 = "^0.1.13"
pydantic2-schemaorg = "0.3.0"
dpath = "^2.2.0"
jinja2-simple-tags = "^0.6.1"
jinjax = "^0.48"
pydantic = "^2.10.5"

[tool.poetry.scripts]
blurry = 'blurry:main'
Expand Down
26 changes: 25 additions & 1 deletion tests/test_schema_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,30 @@ def test_validate_front_matter_as_schema_with_extra_value():
test_console = Console()
test_console.print = MagicMock()
validate_front_matter_as_schema(path, front_matter_with_extra_value, test_console)
assert not test_console.print.called


# Markdown fixture whose TOML front matter sets dateModified to a boolean
# instead of a date.
MARKDOWN_WITH_WRONG_TYPE_IN_TOML_FRONT_MATTER = """
+++
"@type" = "WebPage"
name = "Introduction"
abstract = "A Python-powered static site generator with a focus on page speed and SEO."
datePublished = 2023-04-09
dateModified = true
+++
# Blurry: A Python-powered static site generator
""".strip()


def test_validate_front_matter_as_schema_with_wrong_type():
    """A boolean ``dateModified`` is reported as an invalid datetime."""
    _, front_matter_with_wrong_type = get_data(
        MARKDOWN_WITH_WRONG_TYPE_IN_TOML_FRONT_MATTER
    )
    path = Path("pages/intro.md")
    # Replace print with a mock so the emitted message can be inspected
    test_console = Console()
    test_console.print = MagicMock()
    validate_front_matter_as_schema(path, front_matter_with_wrong_type, test_console)
    test_console.print.assert_called_with(
        "pages/intro.md: WebPage schema validation error: Input should be a valid datetime: dateModified"
    )

0 comments on commit 0d2a75c

Please sign in to comment.