Skip to content
This repository has been archived by the owner on Oct 17, 2024. It is now read-only.

Commit

Permalink
Merge pull request #4 from storebrand/pages
Browse files Browse the repository at this point in the history
Load Pages
  • Loading branch information
hholgersen authored Nov 28, 2023
2 parents d790cd3 + 0a760cd commit 14e9f9b
Show file tree
Hide file tree
Showing 8 changed files with 651 additions and 345 deletions.
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Built with the [Meltano Tap SDK](https://sdk.meltano.com) for Singer Taps.
| api_url | True | None | The url for the API service |
| lists | False | None | The name of the list to sync |
| files | False | None | Files to sync |
| pages | False | None | Whether or not to sync pages |
| client_id | False | None | Managed Identity Client ID |
| stream_maps | False | None | Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html). |
| stream_map_config | False | None | User-defined config values to be used within map expressions. |
Expand Down Expand Up @@ -58,6 +59,21 @@ Example config:
...
```

## Web pages

You can sync the content of SharePoint web pages, which is typically relevant for LLM/RAG use cases. The Microsoft Graph endpoint for pages is still in beta and does not work when authenticated as a personal user; for it to work, you need to use a Managed Identity.

Example config:

```
...
config:
...
pages: true
...
```


<!--
Developer TODO: Update the below as needed to correctly describe the install procedure. For instance, if you do not have a PyPi repo, or if you want users to directly install from your git repo, you can modify this step as appropriate.
Expand Down
727 changes: 394 additions & 333 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@ keywords = [
license = "Apache 2.0"

[tool.poetry.dependencies]
python = "<3.14,>=3.7.1"
python = "<3.12,>=3.7.1"
requests = "^2.25.1"
singer-sdk = "^0.29.0"
azure-identity = "^1.11"
openpyxl = "^3.0.7"
xlrd = "^2.0.1"
selectolax = "0.3.17"

[tool.poetry.dev-dependencies]
pytest = "^6.2.5"
Expand Down
12 changes: 9 additions & 3 deletions tap_sharepointsites/file_handlers/csv_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import logging
import re

from tap_sharepointsites.utils import snakecase

LOGGER = logging.getLogger(__name__)


Expand All @@ -16,7 +14,13 @@ def __init__(self, textcontent, delimiter=","):
"""Initialize ExcelHandler."""
self.textcontent = textcontent
self.delimiter = delimiter
self.clean_colnames = clean_colnames

@staticmethod
def format_key(key):
"""Format key."""
formatted_key = re.sub(r"[^\w\s]", "", key)
formatted_key = re.sub(r"\s+", "_", formatted_key)
return formatted_key.lower()

def get_dictreader(self):
"""Read CSV file and return csv DictReader object for the file."""
Expand All @@ -27,4 +31,6 @@ def get_dictreader(self):
delimiter=self.delimiter,
)

dr.fieldnames = [self.format_key(key) for key in dr.fieldnames.copy()]

return dr
14 changes: 9 additions & 5 deletions tap_sharepointsites/file_handlers/excel_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@

import openpyxl

from tap_sharepointsites.utils import snakecase

LOGGER = logging.getLogger(__name__)


Expand Down Expand Up @@ -38,7 +36,8 @@ def fieldnames(self):
"""Return fieldnames."""
return [c.value for c in self.xlsheet[1]]

def generator_wrapper(self, reader):
@staticmethod
def generator_wrapper(reader):
"""Wrap a reader in a generator."""
header_row = None
for row in reader:
Expand All @@ -51,11 +50,16 @@ def generator_wrapper(self, reader):
header_cell = header_row[index]

formatted_key = header_cell.value

if not formatted_key:
formatted_key = "" # default to empty string for key

to_return[formatted_key] = (
# remove non-word, non-whitespace characters
formatted_key = re.sub(r"[^\w\s]", "", formatted_key)

# replace whitespace with underscores
formatted_key = re.sub(r"\s+", "_", formatted_key)

to_return[formatted_key.lower()] = (
str(cell.value) if cell.value is not None else ""
)

Expand Down
204 changes: 204 additions & 0 deletions tap_sharepointsites/pages_stream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
"""Stream for Pages - most relevant for LLM stuff."""

import datetime
import typing as t

import requests
from azure.identity import DefaultAzureCredential, ManagedIdentityCredential
from selectolax.parser import HTMLParser
from singer_sdk import metrics
from singer_sdk.typing import IntegerType, PropertiesList, Property, StringType

from tap_sharepointsites.client import sharepointsitesStream


class PagesStream(sharepointsitesStream):
    """Stream of SharePoint site pages, chunked for LLM/RAG-style consumption.

    Uses the (beta) Microsoft Graph ``/sites/{site-id}/pages`` endpoint.
    Each page's web-part HTML is flattened to plain text and split into
    fixed-size word chunks; every chunk is emitted as one record.
    """

    name = "pages"
    records_jsonpath = "$.value[*]"
    replication_key = "lastModifiedDateTime"
    # NOTE: must name fields that actually exist on emitted records / in
    # `schema`. The previous values ("_sdc_source_file", "_sdc_chunk_number")
    # matched nothing the stream emits.
    primary_keys = ["_sdc_source_id", "_sdc_chunk_num"]

    def __init__(self, *args, **kwargs):
        """Initialize the Pages stream."""
        self._header = None
        # Cached site id so `site_id` does not re-resolve (one HTTP request
        # plus a token acquisition) on every property access.
        self._site_id = None
        super().__init__(*args, **kwargs)

    def _get_headers(self):
        """Build ad-hoc auth headers via Azure AD.

        Uses a ManagedIdentityCredential when ``client_id`` is configured,
        otherwise falls back to DefaultAzureCredential.
        """
        ad_scope = "https://graph.microsoft.com/.default"

        if self.config.get("client_id"):
            creds = ManagedIdentityCredential(client_id=self.config["client_id"])
        else:
            creds = DefaultAzureCredential()

        token = creds.get_token(ad_scope)

        return {"Authorization": f"Bearer {token.token}"}

    @property
    def url_base(self) -> str:
        """Return the API URL root."""
        return "https://graph.microsoft.com"

    @property
    def header(self):
        """Return fresh auth headers (acquires a token on every access)."""
        return self._get_headers()

    @property
    def path(self) -> str:
        """Return the API endpoint path for the configured site."""
        return f"/beta/sites/{self.site_id}/pages"

    @staticmethod
    def simple_chunker(text: str, chunk_length: int) -> list:
        """Split *text* into chunks of at most ``chunk_length`` words.

        The remainder goes into the last chunk. Always returns at least one
        chunk (possibly empty, for empty input).
        """
        words = text.split(" ")
        # Ceiling division. The previous ``len // chunk_length + 1`` produced
        # a spurious empty trailing chunk when len was an exact multiple.
        num_chunks = max(1, -(-len(words) // chunk_length))
        return [
            " ".join(words[i * chunk_length:(i + 1) * chunk_length])
            for i in range(num_chunks)
        ]

    @property
    def schema(self):
        """Return the JSON schema for this stream.

        ``lastModifiedDateTime`` is kept as a string, matching the raw Graph
        payload. ``eTag``/``id`` are declared for completeness even though
        parse_response does not currently emit them.
        """
        return PropertiesList(
            Property("title", StringType),
            Property("content", StringType),
            Property("eTag", StringType),
            Property("id", StringType),
            Property("lastModifiedDateTime", StringType),
            Property("_sdc_source_id", StringType),
            Property("_sdc_loaded_at", StringType()),
            Property("_sdc_chunk_num", IntegerType()),
        ).to_dict()

    @property
    def site_id(self):
        """Return (and cache) the ID of the configured SharePoint site."""
        if self._site_id is None:
            response = requests.get(self.config.get("api_url"), headers=self.header)
            response.raise_for_status()
            self._site_id = response.json()["id"]
        return self._site_id

    def parse_response(self, response: requests.Response, context) -> t.Iterable[dict]:
        """Parse a pages listing response and yield one record per chunk.

        Pages not modified since the stored replication key value are skipped.
        """
        resp_values = response.json()["value"]
        files_since = (
            self.get_starting_replication_key_value(context) or "1900-01-01T00:00:00Z"
        )

        for page in resp_values:
            if page["lastModifiedDateTime"] <= files_since:
                continue

            page_text = self.get_content_for_page(page["id"])

            # Use a distinct variable for the emitted dict: the original code
            # reassigned `record` inside this loop, so any page with more than
            # one chunk raised KeyError("id") on the second chunk.
            for chunk_num, chunk in enumerate(self.simple_chunker(page_text, 3000)):
                yield {
                    "title": page["title"],
                    "content": chunk,
                    "lastModifiedDateTime": page["lastModifiedDateTime"],
                    "_sdc_source_id": page["id"],
                    # utcnow() is naive; kept for output compatibility.
                    "_sdc_loaded_at": str(datetime.datetime.utcnow()),
                    "_sdc_chunk_num": chunk_num,
                }

    def get_content_for_page(self, id):
        """Fetch a page's web parts and return their HTML flattened to text."""
        base_url = (
            f"{self.url_base}/beta/sites/{self.site_id}/pages/"
            f"{id}/microsoft.graph.sitepage/webparts"
        )

        page_content = requests.get(base_url, headers=self.header)
        page_content.raise_for_status()

        data = page_content.json()
        # Concatenate every web part that actually carries innerHtml.
        htmls = "".join(
            element.get("innerHtml")
            for element in data["value"]
            if element.get("innerHtml")
        )

        return self.parse_html(htmls)

    def request_records(self, context) -> t.Iterable[dict]:
        """Request records from REST endpoint(s), returning response records.

        If pagination is detected, pages will be recursed automatically.

        Args:
            context: Stream partition or context dictionary.

        Yields
        ------
            An item for every record in the response.
        """
        paginator = self.get_new_paginator()

        decorated_request = self.request_decorator(self._request)

        with metrics.http_request_counter(self.name, self.path) as request_counter:
            request_counter.context = context

            while not paginator.finished:
                prepared_request = self.prepare_request(
                    context,
                    next_page_token=paginator.current_value,
                )
                resp = decorated_request(prepared_request, context)
                request_counter.increment()
                self.update_sync_costs(prepared_request, resp, context)

                yield from self.parse_response(resp, context)

                paginator.advance(resp)

    @staticmethod
    def parse_html(html_string: str):
        """Parse an HTML string and return decently formatted text."""
        unwrap_tags = ["em", "strong", "b", "i", "span", "a", "code", "kbd"]
        remove_tags = ["script", "style"]

        parsed_html = HTMLParser(html_string)
        for removed_tag in remove_tags:
            for element in parsed_html.css(removed_tag):
                element.decompose()

        parsed_html.unwrap_tags(unwrap_tags)
        html_texts = []
        for node in parsed_html.css("*"):
            # selectolax `strip=True` would merge unwrapped tag texts
            # together, so strip in plain Python instead.
            node_text = node.text(deep=False, strip=False)
            node_text = node_text.strip().replace("\n", " ")
            if node_text:
                html_texts.append(node_text)
        return "\n".join(html_texts)
16 changes: 14 additions & 2 deletions tap_sharepointsites/tap.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from tap_sharepointsites.file_stream import FilesStream
from tap_sharepointsites.list_stream import ListStream
from tap_sharepointsites.pages_stream import PagesStream


class Tapsharepointsites(Tap):
Expand Down Expand Up @@ -70,6 +71,12 @@ class Tapsharepointsites(Tap):
required=False,
description="Files to sync",
),
th.Property(
"pages",
th.BooleanType,
required=False,
description="Boolean, Whether or not to sync pages",
),
th.Property(
"client_id",
th.DateTimeType,
Expand Down Expand Up @@ -126,9 +133,14 @@ def discover_streams(self) -> List[Stream]:
else:
files_streams = []

all_streams = list_streams + files_streams
if self.config.get("pages"):
pages_streams = [PagesStream(tap=self)]
else:
pages_streams = []

all_streams = list_streams + files_streams + pages_streams

self.logger.info(f"Discovered {len(all_streams)} streams")
self.logger.debug(f"Discovered {len(all_streams)} streams")

return all_streams

Expand Down
4 changes: 3 additions & 1 deletion tap_sharepointsites/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""Utility functions for tap-sharepointsites."""

import re


def snakecase(name):
# Convert camelCase to snake_case
"""Convert a string to snake_case."""
name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name)

Expand Down

0 comments on commit 14e9f9b

Please sign in to comment.