diff --git a/README.md b/README.md index b34b398..a919411 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,8 @@ Play with Data Formulator with one of the following options: Data Formulator will be automatically opened in the browser at [http://localhost:5000](http://localhost:5000). + *Update: you can specify the port number (e.g., 8080) by `python -m data_formulator --port 8080` if the default port is occupied.* + - **Option 2: Codespaces (5 minutes)** You can also run Data Formulator in Codespaces; we have everything pre-configured. For more details, see [CODESPACES.md](CODESPACES.md). diff --git a/package.json b/package.json index aa428cf..4640e6d 100644 --- a/package.json +++ b/package.json @@ -10,6 +10,8 @@ "@mui/icons-material": "^5.14.0", "@mui/material": "^5.6.0", "@reduxjs/toolkit": "^1.8.6", + "@types/dompurify": "^3.0.5", + "@types/validator": "^13.12.2", "ag-grid-community": "^32.0.2", "ag-grid-enterprise": "^32.0.2", "ag-grid-react": "^32.0.2", diff --git a/py-src/data_formulator/agents/agent_data_clean.py b/py-src/data_formulator/agents/agent_data_clean.py index 8674d30..8cecbec 100644 --- a/py-src/data_formulator/agents/agent_data_clean.py +++ b/py-src/data_formulator/agents/agent_data_clean.py @@ -45,6 +45,8 @@ - the csv table should have the same number of cells for each line, according to the title. If there are some rows with missing values, patch them with empty cells. - if the raw data has some rows that do not belong to the table, also remove them (e.g., subtitles in between rows) - if the header row misses some columns, add their corresponding column names. E.g., when the header doesn't have an index column, but every row has an index value, add the missing column header. +* clean up messy column names: + - if the column name contains special characters like "*", "?", "#", "." remove them. * clean up columns with messy information - if a column is number but some cells has annotations like "*" "?" or brackets, clean them up. - if a column is number but has units like ($, %, s), convert them to number (make sure unit conversion is correct when multiple units exist like minute and second) and include unit in the header. @@ -80,7 +82,7 @@ def __init__(self, client, model): self.model = model self.client = client - def run(self, content_type, raw_data): + def run(self, content_type, raw_data, image_cleaning_instruction): """derive a new concept based on the raw input data """ @@ -93,6 +95,12 @@ def run(self, content_type, raw_data): }] } elif content_type == "image": + # add additional cleaning instructions if provided + if image_cleaning_instruction: + cleaning_prompt = f"\n\n[CLEANING INSTRUCTION]\n\n{image_cleaning_instruction}\n\n" + else: + cleaning_prompt = "" + user_prompt = { 'role': 'user', 'content': [ { @@ -107,7 +115,7 @@ def run(self, content_type, raw_data): }, { 'type': 'text', - 'text': '''[OUTPUT]\n\n''' + 'text': f'''{cleaning_prompt}[OUTPUT]\n\n''' }, ] } diff --git a/py-src/data_formulator/app.py b/py-src/data_formulator/app.py index b1f4d19..032edef 100644 --- a/py-src/data_formulator/app.py +++ b/py-src/data_formulator/app.py @@ -295,7 +295,7 @@ def clean_data_request(): agent = DataCleanAgent(client=client, model=model) - candidates = agent.run(content['content_type'], content["raw_data"]) + candidates = agent.run(content['content_type'], content["raw_data"], content["image_cleaning_instruction"]) candidates = [c for c in candidates if c['status'] == 'ok'] diff --git a/pyproject.toml b/pyproject.toml index 1995cdc..d3de308 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "data_formulator" -version = "0.1.3b" +version = "0.1.3c" requires-python = ">=3.9" authors = [ diff --git a/src/data/utils.ts b/src/data/utils.ts index e13f0fb..10f3df1 100644 --- a/src/data/utils.ts +++ b/src/data/utils.ts @@ -66,6 +66,10 @@ export const createTableFromFromObjectArray = (title: string, values: any[], der } return newName; } + // clean up messy column names + if (name && name.includes(".")) { + return name.replace(".", "_"); + } return name; }) diff --git a/src/views/TableSelectionView.tsx b/src/views/TableSelectionView.tsx index e9a23bb..fb84a0f 100644 --- a/src/views/TableSelectionView.tsx +++ b/src/views/TableSelectionView.tsx @@ -417,6 +417,7 @@ export const TableCopyDialogV2: React.FC = ({ buttonElemen const [tableName, setTableName] = useState(""); const [tableContent, setTableContent] = useState(""); + const [imageCleaningInstr, setImageCleaningInstr] = useState(""); const [tableContentType, setTableContentType] = useState<'text' | 'image'>('text'); const [cleaningInProgress, setCleaningInProgress] = useState(false); @@ -476,6 +477,7 @@ export const TableCopyDialogV2: React.FC = ({ buttonElemen token: token, content_type: tableContentType, raw_data: tableContent, + image_cleaning_instruction: imageCleaningInstr, model: activeModel }), }; @@ -652,27 +654,33 @@ export const TableCopyDialogV2: React.FC = ({ buttonElemen label="data content" variant="outlined" multiline minRows={15} /> : - - {cleaningInProgress ? : ""} - { - setTableContent(""); - setTableContentType("text"); - }} - > - - - {validator.isURL(tableContent) || validator.isDataURI(tableContent) ? ( - the image is corrupted, please try again. - ) : ( - Invalid image data - )} + + + {cleaningInProgress ? : ""} + { + setTableContent(""); + setTableContentType("text"); + setImageCleaningInstr(""); + }} + > + + + {validator.isURL(tableContent) || validator.isDataURI(tableContent) ? ( + the image is corrupted, please try again. + ) : ( + Invalid image data + )} + + { setImageCleaningInstr(event.target.value); }} + variant="standard" placeholder='additional cleaning instructions' /> ) } @@ -708,6 +716,7 @@ export const TableCopyDialogV2: React.FC = ({ buttonElemen {"upload"} + ; return <> diff --git a/yarn.lock b/yarn.lock index cdc98f3..aff1cee 100644 --- a/yarn.lock +++ b/yarn.lock @@ -968,6 +968,13 @@ "@types/d3-transition" "*" "@types/d3-zoom" "*" +"@types/dompurify@^3.0.5": + version "3.0.5" + resolved "https://registry.npmjs.org/@types/dompurify/-/dompurify-3.0.5.tgz#02069a2fcb89a163bacf1a788f73cb415dd75cb7" + integrity sha512-1Wg0g3BtQF7sSb27fJQAKck1HECM6zV1EB66j8JH9i3LCjYabJa0FSdiSgsD5K/RbrsR0SiraKacLB+T8ZVYAg== + dependencies: + "@types/trusted-types" "*" + "@types/estree@1.0.5", "@types/estree@^1.0.0": version "1.0.5" resolved "https://registry.npmjs.org/@types/estree/-/estree-1.0.5.tgz#a6ce3e556e00fd9895dd872dd172ad0d4bd687f4" @@ -1056,11 +1063,21 @@ resolved "https://registry.npmjs.org/@types/scheduler/-/scheduler-0.16.3.tgz#cef09e3ec9af1d63d2a6cc5b383a737e24e6dcf5" integrity sha512-5cJ8CB4yAx7BH1oMvdU0Jh9lrEXyPkar6F9G/ERswkCuvP4KQZfZkSjcMbAICCpQTN4OuZn8tz0HiKv9TGZgrQ== +"@types/trusted-types@*": + version "2.0.7" + resolved "https://registry.npmjs.org/@types/trusted-types/-/trusted-types-2.0.7.tgz#baccb07a970b91707df3a3e8ba6896c57ead2d11" + integrity sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw== + "@types/use-sync-external-store@^0.0.3": version "0.0.3" resolved "https://registry.npmjs.org/@types/use-sync-external-store/-/use-sync-external-store-0.0.3.tgz" integrity sha512-EwmlvuaxPNej9+T4v5AuBPJa2x2UOJVdjCtDHgcDqitUeOtjnJKJ+apYjVcAoBEMjKW1VVFGZLUb5+qqa09XFA== +"@types/validator@^13.12.2": + version "13.12.2" + resolved "https://registry.npmjs.org/@types/validator/-/validator-13.12.2.tgz#760329e756e18a4aab82fc502b51ebdfebbe49f5" + integrity sha512-6SlHBzUW8Jhf3liqrGGXyTJSIFe4nqlJ5A5KaMZ2l/vbM3Wh3KSybots/wfWVzNLK4D1NZluDlSQIbIEPx6oyA== + "@vitejs/plugin-react-swc@^3.7.0": version "3.7.0" resolved "https://registry.yarnpkg.com/@vitejs/plugin-react-swc/-/plugin-react-swc-3.7.0.tgz#e456c0a6d7f562268e1d231af9ac46b86ef47d88"