Skip to content
This repository has been archived by the owner on Oct 17, 2024. It is now read-only.

Better column name handling #3

Merged
merged 14 commits into from
Nov 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ The file configuration accepts an array of objects, with keys:
- `folder`: Subfolder where the files are located
- `file_type`: Type (format) of file to load, either `csv` or `excel`.
- `delimiter`: Field delimiter for CSV files. default `,`
- `clean_colnames`: Whether to convert column names to snake_case (runs of non-alphanumeric characters are replaced with a single underscore). default `false`

Example config:

Expand All @@ -53,6 +54,7 @@ Example config:
file_pattern: employees_.*\.csv
folder: hr_data/raw
file_type: csv
clean_colnames: true
...
```

Expand Down
12 changes: 3 additions & 9 deletions tap_sharepointsites/file_handlers/csv_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import logging
import re

from tap_sharepointsites.utils import snakecase

LOGGER = logging.getLogger(__name__)


Expand All @@ -14,13 +16,7 @@ def __init__(self, textcontent, delimiter=","):
"""Initialize CSVHandler."""
self.textcontent = textcontent
self.delimiter = delimiter

@staticmethod
def format_key(key):
"""Format key."""
formatted_key = re.sub(r"[^\w\s]", "", key)
formatted_key = re.sub(r"\s+", "_", formatted_key)
return formatted_key.lower()
self.clean_colnames = clean_colnames

def get_dictreader(self):
"""Read CSV file and return csv DictReader object for the file."""
Expand All @@ -31,6 +27,4 @@ def get_dictreader(self):
delimiter=self.delimiter,
)

dr.fieldnames = [self.format_key(key) for key in dr.fieldnames.copy()]

return dr
14 changes: 5 additions & 9 deletions tap_sharepointsites/file_handlers/excel_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

import openpyxl

from tap_sharepointsites.utils import snakecase

LOGGER = logging.getLogger(__name__)


Expand Down Expand Up @@ -36,8 +38,7 @@ def fieldnames(self):
"""Return fieldnames."""
return [c.value for c in self.xlsheet[1]]

@staticmethod
def generator_wrapper(reader):
def generator_wrapper(self, reader):
"""Wrap a reader in a generator."""
header_row = None
for row in reader:
Expand All @@ -50,16 +51,11 @@ def generator_wrapper(reader):
header_cell = header_row[index]

formatted_key = header_cell.value

if not formatted_key:
formatted_key = "" # default to empty string for key

# remove non-word, non-whitespace characters
formatted_key = re.sub(r"[^\w\s]", "", formatted_key)

# replace whitespace with underscores
formatted_key = re.sub(r"\s+", "_", formatted_key)

to_return[formatted_key.lower()] = (
to_return[formatted_key] = (
str(cell.value) if cell.value is not None else ""
)

Expand Down
19 changes: 10 additions & 9 deletions tap_sharepointsites/file_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from tap_sharepointsites.client import sharepointsitesStream
from tap_sharepointsites.file_handlers.csv_handler import CSVHandler
from tap_sharepointsites.file_handlers.excel_handler import ExcelHandler
from tap_sharepointsites.utils import snakecase


class FilesStream(sharepointsitesStream):
Expand Down Expand Up @@ -126,6 +127,10 @@ def parse_response(self, response: requests.Response, context) -> t.Iterable[dic
raise Exception(f"File type { filetype_name } not supported (yet)")

for i, row in enumerate(dr):

if self.file_config.get("clean_colnames", False):
row = {snakecase(k): v for k, v in row.items()}

row.update(
{
"_sdc_source_file": record["name"],
Expand Down Expand Up @@ -156,15 +161,11 @@ def schema(self):
dr = ExcelHandler(file)

properties = {}
formatted_key = [
re.sub(r"[^\w\s]", "", formatted_key)
for formatted_key in dr.fieldnames
]
formatted_key = [
re.sub(r"\s+", "_", formatted_key)
for formatted_key in formatted_key
]
fieldnames = [formatted_key.lower() for formatted_key in formatted_key]

fieldnames = [name for name in dr.fieldnames]

if self.file_config.get("clean_colnames", False):
fieldnames = [snakecase(name) for name in fieldnames]

extra_cols = [
"_sdc_source_file",
Expand Down
7 changes: 7 additions & 0 deletions tap_sharepointsites/tap.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ class Tapsharepointsites(Tap):
required=False,
description="For CSV files: the delimiter to use",
),
th.Property(
"clean_colnames",
th.BooleanType,
required=False,
default=False,
description="Replace special characters and convert to snakecase",
),
),
),
required=False,
Expand Down
15 changes: 15 additions & 0 deletions tap_sharepointsites/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import re


def snakecase(name):
    """Convert a column name to snake_case.

    camelCase / PascalCase word boundaries are split with underscores, every
    run of characters outside ``[a-zA-Z0-9_]`` is replaced by an underscore,
    repeated underscores are collapsed into one, and the result is
    lower-cased. Leading and trailing underscores are kept as-is.

    Args:
        name: The column name. ``None`` is treated as an empty name, and any
            other non-string value is converted with ``str()`` first, so a
            missing or non-text header does not raise ``TypeError``.

    Returns:
        The snake_case form of ``name`` as a ``str``.
    """
    # Headers are not guaranteed to be text; normalise before using regexes.
    if name is None:
        return ""
    if not isinstance(name, str):
        name = str(name)

    # Convert camelCase to snake_case
    name = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", name)
    name = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", name)

    # Replace any non-alphanumeric characters with underscores
    name = re.sub(r"[^a-zA-Z0-9_]+", "_", name)

    # Replace any sequence of multiple underscores with a single underscore
    name = re.sub(r"_{2,}", "_", name)

    return name.lower()
Loading