Skip to content

Commit

Permalink
feat(cli): add excel loader example
Browse files Browse the repository at this point in the history
  • Loading branch information
sperka committed Nov 8, 2023
1 parent e1c730a commit cad3243
Show file tree
Hide file tree
Showing 11 changed files with 524 additions and 37 deletions.
8 changes: 8 additions & 0 deletions packages/galileo-cli/.projen/deps.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions packages/galileo-cli/.projen/tasks.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/*! Copyright [Amazon.com](http://amazon.com/), Inc. or its affiliates. All Rights Reserved.
PDX-License-Identifier: Apache-2.0 */

import { parseCsv } from "./parser-common";
import { DocumentMetadata, IMetadataProvider } from "../../src";
import { parseCsv } from './parser-common';
import { DocumentMetadata, IMetadataProvider } from '../../src';

export class MetadataProvider implements IMetadataProvider {
async getMetadata(): Promise<string | DocumentMetadata> {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
/*! Copyright [Amazon.com](http://amazon.com/), Inc. or its affiliates. All Rights Reserved.
PDX-License-Identifier: Apache-2.0 */

import * as fs from "node:fs";
import * as path from "node:path";
import { parseCsv } from "./parser-common";
import { DocumentMetadata, IMetadataProvider } from "../../src";
import * as fs from 'node:fs';
import * as path from 'node:path';
import { parseCsv } from './parser-common';
import { DocumentMetadata, IMetadataProvider } from '../../src';

export class MetadataProvider implements IMetadataProvider {
async getMetadata(): Promise<string | DocumentMetadata> {
// build the metadata
const docMetadata = parseCsv();

// save it as a metadata.json file
const outputDir = path.join(__dirname, "generated");
const outputFile = path.join(outputDir, "metadata.json");
const outputDir = path.join(__dirname, 'generated');
const outputFile = path.join(outputDir, 'metadata.json');

// create the generated folder if it doesn't exist
fs.mkdirSync(outputDir, { recursive: true });

// save into a file
fs.writeFileSync(outputFile, JSON.stringify(docMetadata, null, 2), {
encoding: "utf-8",
encoding: 'utf-8',
});

// return absolute path
Expand Down
29 changes: 12 additions & 17 deletions packages/galileo-cli/examples/csv-loader/parser-common.ts
Original file line number Diff line number Diff line change
@@ -1,35 +1,30 @@
/*! Copyright [Amazon.com](http://amazon.com/), Inc. or its affiliates. All Rights Reserved.
PDX-License-Identifier: Apache-2.0 */

import * as fs from "node:fs";
import * as path from "node:path";
import { parse } from "csv-parse/sync";
import { DocumentMetadata } from "../../src";
import * as fs from 'node:fs';
import * as path from 'node:path';
import { parse } from 'csv-parse/sync';
import { DocumentMetadata } from '../../src';

export const parseCsv = (): DocumentMetadata => {
try {
const filename = "example.csv";
const filename = 'example.csv';
const filepath = path.join(__dirname, filename);
const csvFileContent = fs.readFileSync(filepath, { encoding: "utf-8" });
const csvFileContent = fs.readFileSync(filepath, { encoding: 'utf-8' });

const rows: any[] = parse(csvFileContent, {
columns: [
{ name: "service" },
{ name: "description" },
{ name: "serviceType" },
{ name: "freeTierType" },
],
delimiter: ",",
encoding: "utf-8",
columns: [{ name: 'service' }, { name: 'description' }, { name: 'serviceType' }, { name: 'freeTierType' }],
delimiter: ',',
encoding: 'utf-8',
from_line: 2, // don't parse header line
});

const docMetadata: DocumentMetadata = {
// leave it empty as we're not using any files
// if you want to use files, use absolute path
rootDir: "",
rootDir: '',
metadata: {
domain: "aws-services",
domain: 'aws-services',
},
documents: {},
};
Expand All @@ -41,7 +36,7 @@ export const parseCsv = (): DocumentMetadata => {
metadata: {
serviceType: row.serviceType,
// only include this metadata entry if value present
freeTierType: row.freeTierType === "" ? undefined : row.freeTierType,
freeTierType: row.freeTierType === '' ? undefined : row.freeTierType,
},
};
}
Expand Down
Binary file not shown.
193 changes: 193 additions & 0 deletions packages/galileo-cli/examples/excel-loader/excel-loader.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
/*! Copyright [Amazon.com](http://amazon.com/), Inc. or its affiliates. All Rights Reserved.
PDX-License-Identifier: Apache-2.0 */
import * as fs from 'node:fs';
import * as path from 'node:path';
import ExcelJs from 'exceljs';
import { isEmpty } from 'lodash';
import { DocumentMetadata } from '../../src';

/**
 * Configuration accepted by the `ExcelLoader` constructor.
 *
 * All row and column indexes are 1-based, following excel's own numbering.
 */
export interface ExcelLoaderOptions {
  /**
   * Path to the excel file.
   *
   * The file's base name is also used as the first segment of every generated document key
   * (`<file name>/<worksheet name>/row-<row index>`).
   */
  readonly execFilepath: string;

  /**
   * The domain to use in the root metadata object.
   * @default "my-example-domain"
   */
  readonly domain?: string;

  /**
   * Additional metadata to use in the root metadata object.
   *
   * These entries are spread after `domain`, so a `domain` key in here
   * takes precedence over the `domain` option.
   */
  readonly rootMetadata: Record<string, string>;

  /**
   * The name of the key to add the worksheet's name as metadata.
   *
   * E.g.: if the value of this parameter is "category" and the worksheet's name is "Sheet1",
   * it will be added as `category: Sheet1` into the document's metadata object.
   *
   * If it's not set, the worksheet's name won't be added as a metadata.
   */
  readonly worksheetNameMetadataKey?: string;

  /**
   * Whether to save the document metadata as a `metadata.json` file.
   * If it's `true`, the metadata is written to `generated/metadata.json` inside the directory
   * that contains the excel file, and the path of that file is returned;
   * otherwise, the `DocumentMetadata` object is returned.
   */
  readonly saveMetadata?: boolean;

  /**
   * The index of the header row. This is 1-based (i.e.: excel row indexing)
   * @default 1
   */
  readonly headerRowIndex?: number;

  /**
   * The start index of the rows containing data. This is 1-based (i.e.: excel row indexing)
   * @default 2
   */
  readonly dataRowStartIndex?: number;

  /**
   * Column selector to include data in the `pageContent`.
   * Use column index (1-based) or column letter.
   *
   * The content will be added to `pageContent` in the order of the selector items,
   * as newline-separated `headerValue`: `cellValue` lines.
   *
   * Empty cells are left out, and a row whose resulting `pageContent` is empty
   * does not produce a document entry at all.
   */
  readonly pageContentColumnSelector: (string | number)[];

  /**
   * Column selector to use to include data in the entry's `metadata`.
   * Use column index (1-based) or column letter.
   *
   * This will be added to the entry's `metadata`
   * in the format of `headerValue`: `cellValue`. Empty cells are left out.
   *
   * @default undefined
   */
  readonly entryMetadataColumnSelector?: (string | number)[];

  /**
   * Whether to use base64 encoded metadata in the entries' metadata section.
   * Set this to `true` if you have metadata that uses non-US-ASCII characters (e.g.: Vietnamese chars)
   *
   * When enabled, the values selected by `entryMetadataColumnSelector` are serialized to JSON,
   * base64 encoded and stored under the single `json-base64` metadata key
   * instead of being added as individual keys.
   *
   * @default false
   */
  readonly useBase64EncodedEntryMetadata?: boolean;
}

/**
 * Turns the rows of every worksheet in an excel workbook into `DocumentMetadata` entries.
 *
 * Each data row becomes one document keyed by `<file name>/<worksheet name>/row-<row index>`.
 * Its `pageContent` is built from the columns in `pageContentColumnSelector` and its `metadata`
 * from the columns in `entryMetadataColumnSelector`.
 */
export class ExcelLoader {
  readonly options: ExcelLoaderOptions;

  constructor(options: ExcelLoaderOptions) {
    this.options = options;
  }

  /**
   * Reads the workbook and builds the document metadata.
   *
   * @returns the absolute path of the written `metadata.json` when `saveMetadata` is enabled;
   * otherwise the in-memory `DocumentMetadata` object.
   * @throws whatever the workbook reader or the file system calls throw
   * (e.g. the excel file does not exist or the output file cannot be written).
   */
  async process(): Promise<DocumentMetadata | string> {
    const { execFilepath, pageContentColumnSelector, entryMetadataColumnSelector = [] } = this.options;
    const headerRowIndex = this.options.headerRowIndex ?? 1;
    const dataRowStartIndex = this.options.dataRowStartIndex ?? 2;

    const documentMetadata: DocumentMetadata = {
      rootDir: '',
      metadata: {
        domain: this.options.domain ?? 'my-example-domain',
        // spread last on purpose: rootMetadata may override the domain
        ...this.options.rootMetadata,
      },
      documents: {},
    };
    const excelFilename = path.basename(execFilepath);

    const workbook = new ExcelJs.Workbook();
    await workbook.xlsx.readFile(execFilepath);

    for (const worksheet of workbook.worksheets) {
      console.log(`Processing worksheet ${worksheet.name}`);

      // a worksheet whose last row is the start row still holds one data row,
      // so only skip when the sheet ends before the first data row
      const lastRowIdx = worksheet.rowCount;
      if (lastRowIdx < dataRowStartIndex) {
        continue;
      }

      // resolve the header text for the columns of BOTH selectors, so metadata-only columns
      // (or the same column addressed once by letter and once by index) get a proper key
      const headerRow = worksheet.getRow(headerRowIndex);
      const headers = new Map<string, string>();
      for (const column of [...pageContentColumnSelector, ...entryMetadataColumnSelector]) {
        headers.set(`${column}`, headerRow.getCell(column).toString());
      }

      for (let rowIdx = dataRowStartIndex; rowIdx <= lastRowIdx; rowIdx++) {
        const row = worksheet.getRow(rowIdx);
        const readCell = (column: string | number): string => row.getCell(column).toString();

        // rows without any page content are not worth indexing
        const pageContent = this.buildPageContent(headers, readCell);
        if (isEmpty(pageContent)) {
          continue;
        }

        documentMetadata.documents[`${excelFilename}/${worksheet.name}/row-${rowIdx}`] = {
          metadata: this.buildEntryMetadata(worksheet.name, headers, readCell),
          pageContent,
        };
      }
    }

    return this.options.saveMetadata ? this.saveToFile(documentMetadata) : documentMetadata;
  }

  /** Joins the non-empty cells of the page-content columns into `header: value` lines. */
  private buildPageContent(headers: Map<string, string>, readCell: (column: string | number) => string): string {
    const lines: string[] = [];
    for (const column of this.options.pageContentColumnSelector) {
      const value = readCell(column);
      if (!isEmpty(value)) {
        lines.push(`${this.headerOf(headers, column)}: ${value}`);
      }
    }
    return lines.join('\n');
  }

  /** Builds one entry's metadata: optional worksheet name plus the selected columns (plain or base64 encoded). */
  private buildEntryMetadata(
    worksheetName: string,
    headers: Map<string, string>,
    readCell: (column: string | number) => string,
  ): Record<string, string> {
    const columnMetadata: Record<string, string> = {};
    for (const column of this.options.entryMetadataColumnSelector ?? []) {
      const value = readCell(column);
      if (!isEmpty(value)) {
        columnMetadata[this.headerOf(headers, column)] = value;
      }
    }

    const entryMetadata: Record<string, string> = {};
    if (this.options.worksheetNameMetadataKey) {
      entryMetadata[this.options.worksheetNameMetadataKey] = worksheetName;
    }

    if (this.options.useBase64EncodedEntryMetadata) {
      entryMetadata['json-base64'] = Buffer.from(JSON.stringify(columnMetadata)).toString('base64');
      return entryMetadata;
    }

    return { ...entryMetadata, ...columnMetadata };
  }

  /** Looks up the header text of a column; falls back to the selector itself if it was never registered. */
  private headerOf(headers: Map<string, string>, column: string | number): string {
    const selector = `${column}`;
    return headers.get(selector) ?? selector;
  }

  /** Writes the metadata to `generated/metadata.json` in the excel file's directory and returns its absolute path. */
  private saveToFile(documentMetadata: DocumentMetadata): string {
    // resolve (instead of join) so the returned path is absolute even for a relative excel path
    const outputDirPath = path.resolve(path.dirname(this.options.execFilepath), 'generated');
    // recursive: true also makes this a no-op when the folder already exists
    fs.mkdirSync(outputDirPath, { recursive: true });

    const outputFilepath = path.join(outputDirPath, 'metadata.json');
    fs.writeFileSync(outputFilepath, JSON.stringify(documentMetadata, null, 2), {
      encoding: 'utf-8',
    });

    return outputFilepath;
  }
}
45 changes: 45 additions & 0 deletions packages/galileo-cli/examples/excel-loader/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*! Copyright [Amazon.com](http://amazon.com/), Inc. or its affiliates. All Rights Reserved.
PDX-License-Identifier: Apache-2.0 */
import chalk from 'chalk';
import prompts from 'prompts';

import { ExcelLoader } from './excel-loader';
import { DocumentMetadata, IMetadataProvider } from '../../src';

/**
 * Example metadata provider that asks for the path of an excel file
 * and converts its worksheets into document metadata with `ExcelLoader`.
 */
export class MetadataProvider implements IMetadataProvider {
  /**
   * Prompts for the excel file and runs the loader on it.
   *
   * @returns the loader's result; since `saveMetadata` is enabled below,
   * that is the path of the generated `metadata.json` file.
   * @throws Error when no path was entered (empty answer or cancelled prompt).
   */
  async getMetadata(): Promise<string | DocumentMetadata> {
    // build the metadata
    const { excelPath } = await prompts({
      type: 'text',
      name: 'excelPath',
      message: `Enter the path to the excel file (${chalk.grey(`CWD: ${process.cwd()}`)}):`,
    });

    // a cancelled prompt yields no answer at all; fail early with a clear message
    // instead of letting the workbook reader choke on an undefined/empty path
    if (typeof excelPath !== 'string' || excelPath.trim() === '') {
      throw new Error('No excel file path was provided.');
    }

    const excelLoader = new ExcelLoader({
      // surrounding whitespace is a common copy/paste artifact, not part of the path
      execFilepath: excelPath.trim(),
      domain: 'aws-services',
      rootMetadata: {
        collection: 'example-collection',
      },
      worksheetNameMetadataKey: 'serviceType',
      saveMetadata: true,
      headerRowIndex: 1,
      dataRowStartIndex: 2,

      pageContentColumnSelector: [
        'A', // service
        2, // description -- note, column index
        'D', // url
        'C', // free tier type
      ],

      entryMetadataColumnSelector: [
        'D', // url
        3, // free tier type -- note, column index
      ],
      useBase64EncodedEntryMetadata: false,
    });

    return excelLoader.process();
  }
}
2 changes: 2 additions & 0 deletions packages/galileo-cli/package.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit cad3243

Please sign in to comment.