3.2.0 expose pagerank stats for normalization/standardization
Binh Vu committed Oct 19, 2022
1 parent 947c063 commit 413c140
Showing 3 changed files with 22 additions and 4 deletions.
4 changes: 2 additions & 2 deletions kgdata/wikidata/datasets/entity_pagerank.py
@@ -31,7 +31,7 @@
 from kgdata.wikidata.models.wdentity import WDEntity
 from loguru import logger
 import orjson, ray, numpy as np
-from sm.misc.deser import deserialize_byte_lines, deserialize_lines
+from sm.misc.deser import deserialize_lines
 from tqdm import tqdm


@@ -232,7 +232,7 @@ def deserialize_np2(dat: bytes) -> np.ndarray:
     pagerank_stat_outfile.write_bytes(
         orjson.dumps(
             {
-                "total": total,
+                "sum": total,
                 "len": size,
                 "mean": mean_pagerank,
                 "max": max_pagerank,
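For reference, the statistics serialized above cover the six fields later declared as PageRankStats in extra_ent_db.py. A minimal sketch of how such a summary could be produced with numpy from an array of pagerank scores; the function name and its scores/outfile arguments are illustrative, not the repository's exact code:

from pathlib import Path

import numpy as np
import orjson

def write_pagerank_stats(scores: np.ndarray, outfile: Path) -> None:
    # Summarize the pagerank distribution so downstream consumers can
    # normalize or standardize scores without re-reading the whole dataset.
    outfile.write_bytes(
        orjson.dumps(
            {
                "sum": float(scores.sum()),
                "len": int(scores.size),
                "mean": float(scores.mean()),
                "max": float(scores.max()),
                "min": float(scores.min()),
                "std": float(scores.std()),
            }
        )
    )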
20 changes: 19 additions & 1 deletion kgdata/wikidata/extra_ent_db.py
@@ -7,7 +7,7 @@
 from functools import partial
 from operator import itemgetter
 from pathlib import Path
-from typing import List, Literal, Union, cast, overload
+from typing import List, Literal, Union, cast, overload, TypedDict

 import orjson
 from hugedict.prelude import (
@@ -23,6 +23,10 @@
 from sm.misc.timer import Timer

 EntAttr = Literal["label", "description", "aliases", "instanceof", "pagerank"]
+PageRankStats = TypedDict(
+    "PageRankStats",
+    {"min": float, "max": float, "mean": float, "std": float, "sum": float, "len": int},
+)


 def get_multilingual_key(id: str, lang: str) -> str:
@@ -117,6 +121,14 @@ def get_entity_attr_db(
     return db


+def get_pagerank_stats(
+    dbfile: Union[Path, str],
+) -> PageRankStats:
+    dbpath = Path(dbfile)
+    realdbpath = dbpath.parent / dbpath.stem / f"pagerank.db"
+    return orjson.loads((realdbpath / "pagerank_stats.json").read_bytes())
+
+
 def build_extra_ent_db(
     dbpath: Path, attr: EntAttr, lang: str = "en", compact: bool = True
 ):
@@ -190,6 +202,12 @@ def input_gen():
     with Timer().watch_and_report("Creating SST files"):
         if attr == "pagerank":
             dataset = entity_pagerank(lang=lang)
+            # copy the statistics file to the temporary directory
+            dataset_dir = Path(dataset.file_pattern).parent
+            assert dataset_dir.exists()
+            statsfile = dataset_dir.parent / f"{dataset_dir.name}.json"
+            assert statsfile.exists()
+            shutil.copy2(statsfile, realdbpath / "pagerank_stats.json")
         else:
             dataset = entity_metadata(lang=lang)

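A usage sketch of the new get_pagerank_stats helper for the normalization/standardization mentioned in the commit title; the database path and the two wrapper function names are hypothetical:

from kgdata.wikidata.extra_ent_db import get_pagerank_stats

# Hypothetical path: wherever the pagerank attribute db was built with build_extra_ent_db.
stats = get_pagerank_stats("/data/wikidata/databases/wdentities_attr.db")

def minmax_normalize(score: float) -> float:
    # Rescale a raw pagerank score into [0, 1] using the corpus-wide min/max.
    return (score - stats["min"]) / (stats["max"] - stats["min"])

def standardize(score: float) -> float:
    # Z-score using the corpus-wide mean and standard deviation.
    return (score - stats["mean"]) / stats["std"]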
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "kgdata"
-version = "3.1.1"
+version = "3.2.0"
 description = "Library to process dumps of knowledge graphs (Wikipedia, DBpedia, Wikidata)"
 authors = ["Binh Vu <binh@toan2.com>"]
 license = "MIT"
