Skip to content
This repository has been archived by the owner on Oct 17, 2024. It is now read-only.

Commit

Permalink
Merge pull request #4 from storebrand/pages
Browse files Browse the repository at this point in the history
Load Pages
  • Loading branch information
hholgersen authored Nov 28, 2023
2 parents d790cd3 + 0a760cd commit 14e9f9b
Show file tree
Hide file tree
Showing 8 changed files with 651 additions and 345 deletions.
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Built with the [Meltano Tap SDK](https://sdk.meltano.com) for Singer Taps.
| api_url | True | None | The url for the API service |
| lists | False | None | The name of the list to sync |
| files | False | None | Files to sync |
| pages | False | None | Whether or not to sync pages |
| client_id | False | None | Managed Identity Client ID |
| stream_maps | False | None | Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html). |
| stream_map_config | False | None | User-defined config values to be used within map expressions. |
Expand Down Expand Up @@ -58,6 +59,21 @@ Example config:
...
```

## Web pages

You can sync the content of SharePoint web pages, which is typically relevant for LLM/RAG use cases. The Microsoft Graph endpoint for pages is still in beta and does not work when authenticated as a personal user; for it to work, you need to use a Managed Identity.

Example config:

```
...
config:
...
pages: true
...
```


<!--
Developer TODO: Update the below as needed to correctly describe the install procedure. For instance, if you do not have a PyPi repo, or if you want users to directly install from your git repo, you can modify this step as appropriate.
Expand Down
727 changes: 394 additions & 333 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@ keywords = [
license = "Apache 2.0"

[tool.poetry.dependencies]
python = "<3.14,>=3.7.1"
python = "<3.12,>=3.7.1"
requests = "^2.25.1"
singer-sdk = "^0.29.0"
azure-identity = "^1.11"
openpyxl = "^3.0.7"
xlrd = "^2.0.1"
selectolax = "0.3.17"

[tool.poetry.dev-dependencies]
pytest = "^6.2.5"
Expand Down
12 changes: 9 additions & 3 deletions tap_sharepointsites/file_handlers/csv_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import logging
import re

from tap_sharepointsites.utils import snakecase

LOGGER = logging.getLogger(__name__)


Expand All @@ -16,7 +14,13 @@ def __init__(self, textcontent, delimiter=","):
"""Initialize ExcelHandler."""
self.textcontent = textcontent
self.delimiter = delimiter
self.clean_colnames = clean_colnames

@staticmethod
def format_key(key):
"""Format key."""
formatted_key = re.sub(r"[^\w\s]", "", key)
formatted_key = re.sub(r"\s+", "_", formatted_key)
return formatted_key.lower()

def get_dictreader(self):
"""Read CSV file and return csv DictReader object for the file."""
Expand All @@ -27,4 +31,6 @@ def get_dictreader(self):
delimiter=self.delimiter,
)

dr.fieldnames = [self.format_key(key) for key in dr.fieldnames.copy()]

return dr
14 changes: 9 additions & 5 deletions tap_sharepointsites/file_handlers/excel_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@

import openpyxl

from tap_sharepointsites.utils import snakecase

LOGGER = logging.getLogger(__name__)


Expand Down Expand Up @@ -38,7 +36,8 @@ def fieldnames(self):
"""Return fieldnames."""
return [c.value for c in self.xlsheet[1]]

def generator_wrapper(self, reader):
@staticmethod
def generator_wrapper(reader):
"""Wrap a reader in a generator."""
header_row = None
for row in reader:
Expand All @@ -51,11 +50,16 @@ def generator_wrapper(self, reader):
header_cell = header_row[index]

formatted_key = header_cell.value

if not formatted_key:
formatted_key = "" # default to empty string for key

to_return[formatted_key] = (
# remove non-word, non-whitespace characters
formatted_key = re.sub(r"[^\w\s]", "", formatted_key)

# replace whitespace with underscores
formatted_key = re.sub(r"\s+", "_", formatted_key)

to_return[formatted_key.lower()] = (
str(cell.value) if cell.value is not None else ""
)

Expand Down
204 changes: 204 additions & 0 deletions tap_sharepointsites/pages_stream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
"""Stream for Pages - most relevant for LLM stuff."""

import datetime
import typing as t

import requests
from azure.identity import DefaultAzureCredential, ManagedIdentityCredential
from selectolax.parser import HTMLParser
from singer_sdk import metrics
from singer_sdk.typing import IntegerType, PropertiesList, Property, StringType

from tap_sharepointsites.client import sharepointsitesStream


class PagesStream(sharepointsitesStream):
    """Stream of SharePoint site pages, chunked for LLM/RAG-style consumption.

    Uses the (beta) Microsoft Graph ``/sites/{site-id}/pages`` endpoint.
    Each page's web-part HTML is flattened to plain text and split into
    fixed-size word chunks; every chunk is emitted as one record.
    """

    name = "pages"
    records_jsonpath = "$.value[*]"
    replication_key = "lastModifiedDateTime"
    # NOTE: must name fields that actually exist on emitted records / in
    # `schema`. The previous values ("_sdc_source_file", "_sdc_chunk_number")
    # matched nothing the stream emits.
    primary_keys = ["_sdc_source_id", "_sdc_chunk_num"]

    def __init__(self, *args, **kwargs):
        """Initialize the Pages stream."""
        self._header = None
        # Cached site id so `site_id` does not re-resolve (one HTTP request
        # plus a token acquisition) on every property access.
        self._site_id = None
        super().__init__(*args, **kwargs)

    def _get_headers(self):
        """Build ad-hoc auth headers via Azure AD.

        Uses a ManagedIdentityCredential when ``client_id`` is configured,
        otherwise falls back to DefaultAzureCredential.
        """
        ad_scope = "https://graph.microsoft.com/.default"

        if self.config.get("client_id"):
            creds = ManagedIdentityCredential(client_id=self.config["client_id"])
        else:
            creds = DefaultAzureCredential()

        token = creds.get_token(ad_scope)

        return {"Authorization": f"Bearer {token.token}"}

    @property
    def url_base(self) -> str:
        """Return the API URL root."""
        return "https://graph.microsoft.com"

    @property
    def header(self):
        """Return fresh auth headers (acquires a token on every access)."""
        return self._get_headers()

    @property
    def path(self) -> str:
        """Return the API endpoint path for the configured site."""
        return f"/beta/sites/{self.site_id}/pages"

    @staticmethod
    def simple_chunker(text: str, chunk_length: int) -> list:
        """Split *text* into chunks of at most ``chunk_length`` words.

        The remainder goes into the last chunk. Always returns at least one
        chunk (possibly empty, for empty input).
        """
        words = text.split(" ")
        # Ceiling division. The previous ``len // chunk_length + 1`` produced
        # a spurious empty trailing chunk when len was an exact multiple.
        num_chunks = max(1, -(-len(words) // chunk_length))
        return [
            " ".join(words[i * chunk_length:(i + 1) * chunk_length])
            for i in range(num_chunks)
        ]

    @property
    def schema(self):
        """Return the JSON schema for this stream.

        ``lastModifiedDateTime`` is kept as a string, matching the raw Graph
        payload. ``eTag``/``id`` are declared for completeness even though
        parse_response does not currently emit them.
        """
        return PropertiesList(
            Property("title", StringType),
            Property("content", StringType),
            Property("eTag", StringType),
            Property("id", StringType),
            Property("lastModifiedDateTime", StringType),
            Property("_sdc_source_id", StringType),
            Property("_sdc_loaded_at", StringType()),
            Property("_sdc_chunk_num", IntegerType()),
        ).to_dict()

    @property
    def site_id(self):
        """Return (and cache) the ID of the configured SharePoint site."""
        if self._site_id is None:
            response = requests.get(self.config.get("api_url"), headers=self.header)
            response.raise_for_status()
            self._site_id = response.json()["id"]
        return self._site_id

    def parse_response(self, response: requests.Response, context) -> t.Iterable[dict]:
        """Parse a pages listing response and yield one record per chunk.

        Pages not modified since the stored replication key value are skipped.
        """
        resp_values = response.json()["value"]
        files_since = (
            self.get_starting_replication_key_value(context) or "1900-01-01T00:00:00Z"
        )

        for page in resp_values:
            if page["lastModifiedDateTime"] <= files_since:
                continue

            page_text = self.get_content_for_page(page["id"])

            # Use a distinct variable for the emitted dict: the original code
            # reassigned `record` inside this loop, so any page with more than
            # one chunk raised KeyError("id") on the second chunk.
            for chunk_num, chunk in enumerate(self.simple_chunker(page_text, 3000)):
                yield {
                    "title": page["title"],
                    "content": chunk,
                    "lastModifiedDateTime": page["lastModifiedDateTime"],
                    "_sdc_source_id": page["id"],
                    # utcnow() is naive; kept for output compatibility.
                    "_sdc_loaded_at": str(datetime.datetime.utcnow()),
                    "_sdc_chunk_num": chunk_num,
                }

    def get_content_for_page(self, id):
        """Fetch a page's web parts and return their HTML flattened to text."""
        base_url = (
            f"{self.url_base}/beta/sites/{self.site_id}/pages/"
            f"{id}/microsoft.graph.sitepage/webparts"
        )

        page_content = requests.get(base_url, headers=self.header)
        page_content.raise_for_status()

        data = page_content.json()
        # Concatenate every web part that actually carries innerHtml.
        htmls = "".join(
            element.get("innerHtml")
            for element in data["value"]
            if element.get("innerHtml")
        )

        return self.parse_html(htmls)

    def request_records(self, context) -> t.Iterable[dict]:
        """Request records from REST endpoint(s), returning response records.

        If pagination is detected, pages will be recursed automatically.

        Args:
            context: Stream partition or context dictionary.

        Yields
        ------
            An item for every record in the response.
        """
        paginator = self.get_new_paginator()

        decorated_request = self.request_decorator(self._request)

        with metrics.http_request_counter(self.name, self.path) as request_counter:
            request_counter.context = context

            while not paginator.finished:
                prepared_request = self.prepare_request(
                    context,
                    next_page_token=paginator.current_value,
                )
                resp = decorated_request(prepared_request, context)
                request_counter.increment()
                self.update_sync_costs(prepared_request, resp, context)

                yield from self.parse_response(resp, context)

                paginator.advance(resp)

    @staticmethod
    def parse_html(html_string: str):
        """Parse an HTML string and return decently formatted text."""
        unwrap_tags = ["em", "strong", "b", "i", "span", "a", "code", "kbd"]
        remove_tags = ["script", "style"]

        parsed_html = HTMLParser(html_string)
        for removed_tag in remove_tags:
            for element in parsed_html.css(removed_tag):
                element.decompose()

        parsed_html.unwrap_tags(unwrap_tags)
        html_texts = []
        for node in parsed_html.css("*"):
            # selectolax `strip=True` would merge unwrapped tag texts
            # together, so strip in plain Python instead.
            node_text = node.text(deep=False, strip=False)
            node_text = node_text.strip().replace("\n", " ")
            if node_text:
                html_texts.append(node_text)
        return "\n".join(html_texts)
16 changes: 14 additions & 2 deletions tap_sharepointsites/tap.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from tap_sharepointsites.file_stream import FilesStream
from tap_sharepointsites.list_stream import ListStream
from tap_sharepointsites.pages_stream import PagesStream


class Tapsharepointsites(Tap):
Expand Down Expand Up @@ -70,6 +71,12 @@ class Tapsharepointsites(Tap):
required=False,
description="Files to sync",
),
th.Property(
"pages",
th.BooleanType,
required=False,
description="Boolean, Whether or not to sync pages",
),
th.Property(
"client_id",
th.DateTimeType,
Expand Down Expand Up @@ -126,9 +133,14 @@ def discover_streams(self) -> List[Stream]:
else:
files_streams = []

all_streams = list_streams + files_streams
if self.config.get("pages"):
pages_streams = [PagesStream(tap=self)]
else:
pages_streams = []

all_streams = list_streams + files_streams + pages_streams

self.logger.info(f"Discovered {len(all_streams)} streams")
self.logger.debug(f"Discovered {len(all_streams)} streams")

return all_streams

Expand Down
4 changes: 3 additions & 1 deletion tap_sharepointsites/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""Utility functions for tap-sharepointsites."""

import re


def snakecase(name):
# Convert camelCase to snake_case
"""Convert a string to snake_case."""
name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name)

Expand Down

0 comments on commit 14e9f9b

Please sign in to comment.