Skip to content

Commit

Permalink
Merge pull request #44 from microsoft/dev
Browse files Browse the repository at this point in the history
[deploy] updates to enable providing cleaning instructions
  • Loading branch information
Chenglong-MS authored Oct 24, 2024
2 parents e26816e + 5e1f848 commit b093c01
Show file tree
Hide file tree
Showing 8 changed files with 67 additions and 25 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ Play with Data Formulator with one of the following options:

Data Formulator will be automatically opened in the browser at [http://localhost:5000](http://localhost:5000).

*Update: you can specify a different port number (e.g., 8080) by running `python -m data_formulator --port 8080` if the default port is occupied.*

- **Option 2: Codespaces (5 minutes)**

You can also run Data Formulator in Codespaces; we have everything pre-configured. For more details, see [CODESPACES.md](CODESPACES.md).
Expand Down
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
"@mui/icons-material": "^5.14.0",
"@mui/material": "^5.6.0",
"@reduxjs/toolkit": "^1.8.6",
"@types/dompurify": "^3.0.5",
"@types/validator": "^13.12.2",
"ag-grid-community": "^32.0.2",
"ag-grid-enterprise": "^32.0.2",
"ag-grid-react": "^32.0.2",
Expand Down
12 changes: 10 additions & 2 deletions py-src/data_formulator/agents/agent_data_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
- the csv table should have the same number of cells for each line, according to the title. If there are some rows with missing values, patch them with empty cells.
- if the raw data has some rows that do not belong to the table, also remove them (e.g., subtitles in between rows)
- if the header row misses some columns, add their corresponding column names. E.g., when the header doesn't have an index column, but every row has an index value, add the missing column header.
* clean up messy column names:
- if the column name contains special characters like "*", "?", "#", "." remove them.
* clean up columns with messy information
- if a column is number but some cells has annotations like "*" "?" or brackets, clean them up.
- if a column is number but has units like ($, %, s), convert them to number (make sure unit conversion is correct when multiple units exist like minute and second) and include unit in the header.
Expand Down Expand Up @@ -80,7 +82,7 @@ def __init__(self, client, model):
self.model = model
self.client = client

def run(self, content_type, raw_data):
def run(self, content_type, raw_data, image_cleaning_instruction):
"""derive a new concept based on the raw input data
"""

Expand All @@ -93,6 +95,12 @@ def run(self, content_type, raw_data):
}]
}
elif content_type == "image":
# add additional cleaning instructions if provided
if image_cleaning_instruction:
cleaning_prompt = f"\n\n[CLEANING INSTRUCTION]\n\n{image_cleaning_instruction}\n\n"
else:
cleaning_prompt = ""

user_prompt = {
'role': 'user',
'content': [ {
Expand All @@ -107,7 +115,7 @@ def run(self, content_type, raw_data):
},
{
'type': 'text',
'text': '''[OUTPUT]\n\n'''
'text': f'''{cleaning_prompt}[OUTPUT]\n\n'''
},
]
}
Expand Down
2 changes: 1 addition & 1 deletion py-src/data_formulator/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ def clean_data_request():

agent = DataCleanAgent(client=client, model=model)

candidates = agent.run(content['content_type'], content["raw_data"])
candidates = agent.run(content['content_type'], content["raw_data"], content["image_cleaning_instruction"])

candidates = [c for c in candidates if c['status'] == 'ok']

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "data_formulator"
version = "0.1.3b"
version = "0.1.3c"

requires-python = ">=3.9"
authors = [
Expand Down
4 changes: 4 additions & 0 deletions src/data/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ export const createTableFromFromObjectArray = (title: string, values: any[], der
}
return newName;
}
// clean up messy column names
if (name && name.includes(".")) {
return name.replace(".", "_");
}
return name;
})

Expand Down
51 changes: 30 additions & 21 deletions src/views/TableSelectionView.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,7 @@ export const TableCopyDialogV2: React.FC<TableCopyDialogProps> = ({ buttonElemen
const [tableName, setTableName] = useState<string>("");

const [tableContent, setTableContent] = useState<string>("");
const [imageCleaningInstr, setImageCleaningInstr] = useState<string>("");
const [tableContentType, setTableContentType] = useState<'text' | 'image'>('text');

const [cleaningInProgress, setCleaningInProgress] = useState<boolean>(false);
Expand Down Expand Up @@ -476,6 +477,7 @@ export const TableCopyDialogV2: React.FC<TableCopyDialogProps> = ({ buttonElemen
token: token,
content_type: tableContentType,
raw_data: tableContent,
image_cleaning_instruction: imageCleaningInstr,
model: activeModel
}),
};
Expand Down Expand Up @@ -652,27 +654,33 @@ export const TableCopyDialogV2: React.FC<TableCopyDialogProps> = ({ buttonElemen
label="data content" variant="outlined" multiline minRows={15}
/>
:
<Box sx={{marginTop: 1, position: 'relative'}}>
{cleaningInProgress ? <LinearProgress sx={{ width: '100%', height: "calc(100% - 4px)", opacity: 0.1, position: 'absolute', zIndex: 1 }} /> : ""}
<IconButton size="small" color="primary"
sx={{ backgroundColor: 'white',
width: 16, height: 16, boxShadow: 3,
position: 'absolute', right: 4, top: 4,
"&:hover": { backgroundColor: "white", boxShadow: 8, transform: "translate(0.5px, -0.5px)" }
}}
onClick={() => {
setTableContent("");
setTableContentType("text");
}}
>
<CancelIcon sx={{fontSize: 16}} />
</IconButton>
{validator.isURL(tableContent) || validator.isDataURI(tableContent) ? (
<img style={{border: '1px lightgray solid', borderRadius: 4, maxWidth: 640, maxHeight: 360}}
src={DOMPurify.sanitize(tableContent)} alt="the image is corrupted, please try again." />
) : (
<Typography color="error">Invalid image data</Typography>
)}
<Box sx={{display: 'flex', flexDirection: 'column', alignItems: 'center'}}>
<Box sx={{marginTop: 1, position: 'relative'}}>
{cleaningInProgress ? <LinearProgress sx={{ width: '100%', height: "calc(100% - 4px)", opacity: 0.1, position: 'absolute', zIndex: 1 }} /> : ""}
<IconButton size="small" color="primary"
sx={{ backgroundColor: 'white',
width: 16, height: 16, boxShadow: 3,
position: 'absolute', right: 4, top: 4,
"&:hover": { backgroundColor: "white", boxShadow: 8, transform: "translate(0.5px, -0.5px)" }
}}
onClick={() => {
setTableContent("");
setTableContentType("text");
setImageCleaningInstr("");
}}
>
<CancelIcon sx={{fontSize: 16}} />
</IconButton>
{validator.isURL(tableContent) || validator.isDataURI(tableContent) ? (
<img style={{border: '1px lightgray solid', borderRadius: 4, maxWidth: 640, maxHeight: 360}}
src={DOMPurify.sanitize(tableContent)} alt="the image is corrupted, please try again." />
) : (
<Typography color="error">Invalid image data</Typography>
)}
</Box>
<TextField fullWidth size="small" sx={{ marginTop: 1, "& .MuiInputBase-input" : {fontSize: 14, lineHeight: 1.2 }}}
value={imageCleaningInstr} onChange={(event) => { setImageCleaningInstr(event.target.value); }}
variant="standard" placeholder='additional cleaning instructions' />
</Box>)
}
</Box>
Expand Down Expand Up @@ -708,6 +716,7 @@ export const TableCopyDialogV2: React.FC<TableCopyDialogProps> = ({ buttonElemen
{"upload"}
</Button>
</DialogActions>

</Dialog>;

return <>
Expand Down
17 changes: 17 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -968,6 +968,13 @@
"@types/d3-transition" "*"
"@types/d3-zoom" "*"

"@types/dompurify@^3.0.5":
version "3.0.5"
resolved "https://registry.npmjs.org/@types/dompurify/-/dompurify-3.0.5.tgz#02069a2fcb89a163bacf1a788f73cb415dd75cb7"
integrity sha512-1Wg0g3BtQF7sSb27fJQAKck1HECM6zV1EB66j8JH9i3LCjYabJa0FSdiSgsD5K/RbrsR0SiraKacLB+T8ZVYAg==
dependencies:
"@types/trusted-types" "*"

"@types/estree@1.0.5", "@types/estree@^1.0.0":
version "1.0.5"
resolved "https://registry.npmjs.org/@types/estree/-/estree-1.0.5.tgz#a6ce3e556e00fd9895dd872dd172ad0d4bd687f4"
Expand Down Expand Up @@ -1056,11 +1063,21 @@
resolved "https://registry.npmjs.org/@types/scheduler/-/scheduler-0.16.3.tgz#cef09e3ec9af1d63d2a6cc5b383a737e24e6dcf5"
integrity sha512-5cJ8CB4yAx7BH1oMvdU0Jh9lrEXyPkar6F9G/ERswkCuvP4KQZfZkSjcMbAICCpQTN4OuZn8tz0HiKv9TGZgrQ==

"@types/trusted-types@*":
version "2.0.7"
resolved "https://registry.npmjs.org/@types/trusted-types/-/trusted-types-2.0.7.tgz#baccb07a970b91707df3a3e8ba6896c57ead2d11"
integrity sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==

"@types/use-sync-external-store@^0.0.3":
version "0.0.3"
resolved "https://registry.npmjs.org/@types/use-sync-external-store/-/use-sync-external-store-0.0.3.tgz"
integrity sha512-EwmlvuaxPNej9+T4v5AuBPJa2x2UOJVdjCtDHgcDqitUeOtjnJKJ+apYjVcAoBEMjKW1VVFGZLUb5+qqa09XFA==

"@types/validator@^13.12.2":
version "13.12.2"
resolved "https://registry.npmjs.org/@types/validator/-/validator-13.12.2.tgz#760329e756e18a4aab82fc502b51ebdfebbe49f5"
integrity sha512-6SlHBzUW8Jhf3liqrGGXyTJSIFe4nqlJ5A5KaMZ2l/vbM3Wh3KSybots/wfWVzNLK4D1NZluDlSQIbIEPx6oyA==

"@vitejs/plugin-react-swc@^3.7.0":
version "3.7.0"
resolved "https://registry.yarnpkg.com/@vitejs/plugin-react-swc/-/plugin-react-swc-3.7.0.tgz#e456c0a6d7f562268e1d231af9ac46b86ef47d88"
Expand Down

0 comments on commit b093c01

Please sign in to comment.