Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updates to enable providing cleaning instructions #44

Merged
merged 1 commit into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ Play with Data Formulator with one of the following options:

Data Formulator will be automatically opened in the browser at [http://localhost:5000](http://localhost:5000).

*Update: you can specify the port number (e.g., 8080) by running `python -m data_formulator --port 8080` if the default port is occupied.*

- **Option 2: Codespaces (5 minutes)**

You can also run Data Formulator in Codespaces; we have everything pre-configured. For more details, see [CODESPACES.md](CODESPACES.md).
Expand Down
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
"@mui/icons-material": "^5.14.0",
"@mui/material": "^5.6.0",
"@reduxjs/toolkit": "^1.8.6",
"@types/dompurify": "^3.0.5",
"@types/validator": "^13.12.2",
"ag-grid-community": "^32.0.2",
"ag-grid-enterprise": "^32.0.2",
"ag-grid-react": "^32.0.2",
Expand Down
12 changes: 10 additions & 2 deletions py-src/data_formulator/agents/agent_data_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
- the csv table should have the same number of cells for each line, according to the title. If there are some rows with missing values, patch them with empty cells.
- if the raw data has some rows that do not belong to the table, also remove them (e.g., subtitles in between rows)
- if the header row misses some columns, add their corresponding column names. E.g., when the header doesn't have an index column, but every row has an index value, add the missing column header.
* clean up messy column names:
- if the column name contains special characters like "*", "?", "#", "." remove them.
* clean up columns with messy information
- if a column is number but some cells has annotations like "*" "?" or brackets, clean them up.
- if a column is number but has units like ($, %, s), convert them to number (make sure unit conversion is correct when multiple units exist like minute and second) and include unit in the header.
Expand Down Expand Up @@ -80,7 +82,7 @@ def __init__(self, client, model):
self.model = model
self.client = client

def run(self, content_type, raw_data):
def run(self, content_type, raw_data, image_cleaning_instruction):
"""derive a new concept based on the raw input data
"""

Expand All @@ -93,6 +95,12 @@ def run(self, content_type, raw_data):
}]
}
elif content_type == "image":
# add additional cleaning instructions if provided
if image_cleaning_instruction:
cleaning_prompt = f"\n\n[CLEANING INSTRUCTION]\n\n{image_cleaning_instruction}\n\n"
else:
cleaning_prompt = ""

user_prompt = {
'role': 'user',
'content': [ {
Expand All @@ -107,7 +115,7 @@ def run(self, content_type, raw_data):
},
{
'type': 'text',
'text': '''[OUTPUT]\n\n'''
'text': f'''{cleaning_prompt}[OUTPUT]\n\n'''
},
]
}
Expand Down
2 changes: 1 addition & 1 deletion py-src/data_formulator/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ def clean_data_request():

agent = DataCleanAgent(client=client, model=model)

candidates = agent.run(content['content_type'], content["raw_data"])
candidates = agent.run(content['content_type'], content["raw_data"], content["image_cleaning_instruction"])

candidates = [c for c in candidates if c['status'] == 'ok']

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "data_formulator"
version = "0.1.3b"
version = "0.1.3c"

requires-python = ">=3.9"
authors = [
Expand Down
4 changes: 4 additions & 0 deletions src/data/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ export const createTableFromFromObjectArray = (title: string, values: any[], der
}
return newName;
}
// clean up messy column names
if (name && name.includes(".")) {
return name.replace(".", "_");
}
return name;
})

Expand Down
51 changes: 30 additions & 21 deletions src/views/TableSelectionView.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,7 @@ export const TableCopyDialogV2: React.FC<TableCopyDialogProps> = ({ buttonElemen
const [tableName, setTableName] = useState<string>("");

const [tableContent, setTableContent] = useState<string>("");
const [imageCleaningInstr, setImageCleaningInstr] = useState<string>("");
const [tableContentType, setTableContentType] = useState<'text' | 'image'>('text');

const [cleaningInProgress, setCleaningInProgress] = useState<boolean>(false);
Expand Down Expand Up @@ -476,6 +477,7 @@ export const TableCopyDialogV2: React.FC<TableCopyDialogProps> = ({ buttonElemen
token: token,
content_type: tableContentType,
raw_data: tableContent,
image_cleaning_instruction: imageCleaningInstr,
model: activeModel
}),
};
Expand Down Expand Up @@ -652,27 +654,33 @@ export const TableCopyDialogV2: React.FC<TableCopyDialogProps> = ({ buttonElemen
label="data content" variant="outlined" multiline minRows={15}
/>
:
<Box sx={{marginTop: 1, position: 'relative'}}>
{cleaningInProgress ? <LinearProgress sx={{ width: '100%', height: "calc(100% - 4px)", opacity: 0.1, position: 'absolute', zIndex: 1 }} /> : ""}
<IconButton size="small" color="primary"
sx={{ backgroundColor: 'white',
width: 16, height: 16, boxShadow: 3,
position: 'absolute', right: 4, top: 4,
"&:hover": { backgroundColor: "white", boxShadow: 8, transform: "translate(0.5px, -0.5px)" }
}}
onClick={() => {
setTableContent("");
setTableContentType("text");
}}
>
<CancelIcon sx={{fontSize: 16}} />
</IconButton>
{validator.isURL(tableContent) || validator.isDataURI(tableContent) ? (
<img style={{border: '1px lightgray solid', borderRadius: 4, maxWidth: 640, maxHeight: 360}}
src={DOMPurify.sanitize(tableContent)} alt="the image is corrupted, please try again." />
) : (
<Typography color="error">Invalid image data</Typography>
)}
<Box sx={{display: 'flex', flexDirection: 'column', alignItems: 'center'}}>
<Box sx={{marginTop: 1, position: 'relative'}}>
{cleaningInProgress ? <LinearProgress sx={{ width: '100%', height: "calc(100% - 4px)", opacity: 0.1, position: 'absolute', zIndex: 1 }} /> : ""}
<IconButton size="small" color="primary"
sx={{ backgroundColor: 'white',
width: 16, height: 16, boxShadow: 3,
position: 'absolute', right: 4, top: 4,
"&:hover": { backgroundColor: "white", boxShadow: 8, transform: "translate(0.5px, -0.5px)" }
}}
onClick={() => {
setTableContent("");
setTableContentType("text");
setImageCleaningInstr("");
}}
>
<CancelIcon sx={{fontSize: 16}} />
</IconButton>
{validator.isURL(tableContent) || validator.isDataURI(tableContent) ? (
<img style={{border: '1px lightgray solid', borderRadius: 4, maxWidth: 640, maxHeight: 360}}
src={DOMPurify.sanitize(tableContent)} alt="the image is corrupted, please try again." />
) : (
<Typography color="error">Invalid image data</Typography>
)}
</Box>
<TextField fullWidth size="small" sx={{ marginTop: 1, "& .MuiInputBase-input" : {fontSize: 14, lineHeight: 1.2 }}}
value={imageCleaningInstr} onChange={(event) => { setImageCleaningInstr(event.target.value); }}
variant="standard" placeholder='additional cleaning instructions' />
</Box>)
}
</Box>
Expand Down Expand Up @@ -708,6 +716,7 @@ export const TableCopyDialogV2: React.FC<TableCopyDialogProps> = ({ buttonElemen
{"upload"}
</Button>
</DialogActions>

</Dialog>;

return <>
Expand Down
17 changes: 17 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -968,6 +968,13 @@
"@types/d3-transition" "*"
"@types/d3-zoom" "*"

"@types/dompurify@^3.0.5":
version "3.0.5"
resolved "https://registry.npmjs.org/@types/dompurify/-/dompurify-3.0.5.tgz#02069a2fcb89a163bacf1a788f73cb415dd75cb7"
integrity sha512-1Wg0g3BtQF7sSb27fJQAKck1HECM6zV1EB66j8JH9i3LCjYabJa0FSdiSgsD5K/RbrsR0SiraKacLB+T8ZVYAg==
dependencies:
"@types/trusted-types" "*"

"@types/estree@1.0.5", "@types/estree@^1.0.0":
version "1.0.5"
resolved "https://registry.npmjs.org/@types/estree/-/estree-1.0.5.tgz#a6ce3e556e00fd9895dd872dd172ad0d4bd687f4"
Expand Down Expand Up @@ -1056,11 +1063,21 @@
resolved "https://registry.npmjs.org/@types/scheduler/-/scheduler-0.16.3.tgz#cef09e3ec9af1d63d2a6cc5b383a737e24e6dcf5"
integrity sha512-5cJ8CB4yAx7BH1oMvdU0Jh9lrEXyPkar6F9G/ERswkCuvP4KQZfZkSjcMbAICCpQTN4OuZn8tz0HiKv9TGZgrQ==

"@types/trusted-types@*":
version "2.0.7"
resolved "https://registry.npmjs.org/@types/trusted-types/-/trusted-types-2.0.7.tgz#baccb07a970b91707df3a3e8ba6896c57ead2d11"
integrity sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==

"@types/use-sync-external-store@^0.0.3":
version "0.0.3"
resolved "https://registry.npmjs.org/@types/use-sync-external-store/-/use-sync-external-store-0.0.3.tgz"
integrity sha512-EwmlvuaxPNej9+T4v5AuBPJa2x2UOJVdjCtDHgcDqitUeOtjnJKJ+apYjVcAoBEMjKW1VVFGZLUb5+qqa09XFA==

"@types/validator@^13.12.2":
version "13.12.2"
resolved "https://registry.npmjs.org/@types/validator/-/validator-13.12.2.tgz#760329e756e18a4aab82fc502b51ebdfebbe49f5"
integrity sha512-6SlHBzUW8Jhf3liqrGGXyTJSIFe4nqlJ5A5KaMZ2l/vbM3Wh3KSybots/wfWVzNLK4D1NZluDlSQIbIEPx6oyA==

"@vitejs/plugin-react-swc@^3.7.0":
version "3.7.0"
resolved "https://registry.yarnpkg.com/@vitejs/plugin-react-swc/-/plugin-react-swc-3.7.0.tgz#e456c0a6d7f562268e1d231af9ac46b86ef47d88"
Expand Down