Merge branch 'mikecafarella:main' into main
YohannParis authored Aug 16, 2023
2 parents 46a4337 + d11484e commit 0fd4f71
Showing 57 changed files with 12,229 additions and 184 deletions.
22 changes: 21 additions & 1 deletion Dockerfile
@@ -8,16 +8,36 @@ RUN apt-get update && apt-get install -y \
libgraphviz-dev \
pkg-config

# set up locale
RUN apt-get update && apt-get install -y locales locales-all
RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && locale-gen en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
ENV LC_ALL en_US.UTF-8
ENV LC_NUMERIC en_US.UTF-8
#RUN dpkg-reconfigure locales


# Automates clone and install
RUN mkdir /automates
RUN git clone https://github.com/ml4ai/automates.git ./automates
WORKDIR /automates
RUN pip install -e .

WORKDIR /
# local KG
# get mira KG
RUN curl -o epi_2023-07-07_nodes.tsv.gz https://askem-mira.s3.amazonaws.com/dkg/epi/build/2023-07-07/nodes.tsv.gz
RUN gunzip epi_2023-07-07_nodes.tsv.gz

# Copy src code in and start API
COPY . /
WORKDIR /
ENV PYTHONPATH "${PYTHONPATH}:/src"
RUN pip install -r requirements.txt



WORKDIR /api


CMD ["uvicorn", "server:app", "--reload", "--host", "0.0.0.0", "--port", "8000"]
7 changes: 6 additions & 1 deletion README.md
@@ -6,11 +6,16 @@ This repository contains the code and products produced by the MIT team as part
The MIT team consists of (in alphabetical order):
- [Michael Cafarella](https://www.csail.mit.edu/person/michael-cafarella)
- [Peter Baile Chen](https://peterbaile.github.io/)
- [Wenjia He](https://web.eecs.umich.edu/~wenjiah/)
- [Chunwei Liu](https://people.csail.mit.edu/chunwei/)
- [Markos Markakis](https://people.csail.mit.edu/markakis/)
- [Oscar Moll](https://www.csail.mit.edu/person/oscar-ricardo-moll-thomae)
- [Theo Olausson](https://theoxo.xyz/)
- [Anna Zeng](https://people.csail.mit.edu/annazeng/)


## Public API

Our functionality is provided via a public API available [here](http://100.26.10.46/). Many of the calls also require you to provide a GPT key, which you can obtain from [OpenAI](https://beta.openai.com/login/).
Our functionality is provided via a public API available [here](http://3.83.68.208/). Many of the calls also require you to provide a GPT key, which you can obtain from [OpenAI](https://beta.openai.com/login/).

For examples of usage, you can refer to our most recent demo [here](https://github.com/mikecafarella/mitaskem/blob/d26ccfb57b3605e54dd0068510f18c9b19f0b599/demos/2023-02-01/mit-feb1-demo.ipynb).
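
As a quick illustration, here is a minimal sketch of calling the API from Python. It assumes the annotation router from `api/routers/annotation.py` is mounted at the root path of the public API and that the endpoint and parameter names below match this commit; the file name and key are placeholders.

```python
# Minimal sketch (not from the repo): calling the public API with `requests`.
# Assumes the annotation router is mounted at the root path and that the
# endpoint/parameter names match api/routers/annotation.py in this commit.
import requests

BASE_URL = "http://3.83.68.208"  # public API address above
GPT_KEY = "sk-..."               # your OpenAI key (placeholder)

# /find_text_vars expects the paper text as an uploaded file and the GPT key
# as a query parameter.
with open("paper.txt", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/find_text_vars",
        params={"gpt_key": GPT_KEY},
        files={"file": ("paper.txt", f, "text/plain")},
    )
resp.raise_for_status()
print(resp.json())
```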
75 changes: 60 additions & 15 deletions api/routers/annotation.py
@@ -1,23 +1,29 @@
import ast, io, random, sys, os
import ast, io, random, sys, os, csv

from fastapi import APIRouter, status, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse

from file_cache import save_file_to_cache
from mit_extraction import async_mit_extraction_restAPI
from mit_extraction import async_mit_extraction_restAPI, afind_vars_from_text
from typing import Dict, Optional

sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
from src.text_search import text_var_search, vars_to_json, vars_dedup
from src.connect import vars_formula_connection, dataset_header_document_dkg, vars_dataset_connection_simplified
from src.connect import vars_formula_connection, dataset_header_document_dkg, vars_dataset_connection_simplified, profile_matrix, get_dataset_type, process_data
from src.link_annos_to_pyacset import link_annos_to_pyacset
from src.response_types import TabularProfile, MatrixProfile

router = APIRouter()


@router.post("/find_text_vars", tags=["Paper-2-annotated-vars"])
def find_variables_from_text(text: str, gpt_key: str):
async def find_variables_from_text(gpt_key: str, file: UploadFile = File(...)) -> JSONResponse:

contents = await file.read()
json_str = await afind_vars_from_text(contents.decode(), gpt_key)
return json_str

length = len(text)
segments = int(length/1000 + 1)
@@ -36,7 +42,7 @@ def find_variables_from_text(text: str, gpt_key: str):
return ast.literal_eval(vars_to_json(vars_dedup(outputs)))

@router.post("/link_datasets_to_vars", tags=["Paper-2-annotated-vars"])
def link_dataset_columns_to_extracted_variables(json_str: str, dataset_str: str, gpt_key: str):
def link_dataset_columns_to_extracted_variables(json_str: str, dataset_str: str, gpt_key: str) -> JSONResponse:
s, success = vars_dataset_connection_simplified(json_str=json_str, dataset_str=dataset_str, gpt_key=gpt_key)

if not success:
@@ -45,7 +51,7 @@ def link_dataset_columns_to_extracted_variables(json_str: str, dataset_str: str,
return ast.literal_eval(s)

@router.post("/link_latex_to_vars", tags=["Paper-2-annotated-vars"])
def link_latex_formulas_to_extracted_variables(json_str: str, formula: str, gpt_key: str):
def link_latex_formulas_to_extracted_variables(json_str: str, formula: str, gpt_key: str) -> JSONResponse:
s, success = vars_formula_connection(json_str=json_str, formula=formula, gpt_key=gpt_key)

if not success:
@@ -54,30 +60,68 @@ def link_latex_formulas_to_extracted_variables(json_str: str, formula: str, gpt_
return ast.literal_eval(s)

@router.post("/link_annos_to_pyacset", tags=["Paper-2-annotated-vars"])
def link_annotation_to_pyacset_and_paper_info(pyacset_str: str, annotations_str: str, info_str: str = ""):
def link_annotation_to_pyacset_and_paper_info(pyacset_str: str, annotations_str: str, info_str: str = "") -> JSONResponse:
s = link_annos_to_pyacset(pyacset_s = pyacset_str, annos_s = annotations_str, info_s = info_str)

return ast.literal_eval(s)


@router.post("/link_dataset_col_to_dkg", tags=["Paper-2-annotated-vars"])
async def link_dataset_columns_to_dkg_info(gpt_key: str, csv_file: UploadFile = File(...), doc_file: UploadFile = File(...)):
@router.post("/profile_matrix_data", tags=["Paper-2-annotated-vars"], response_model=MatrixProfile)
async def profile_matrix_data(gpt_key: str, csv_file: UploadFile = File(...), doc_file: UploadFile = File(...), smart: Optional[bool] = False) -> JSONResponse:

csv_string = await csv_file.read()
csv_str = csv_string.decode()
csv_reader = csv.reader(io.StringIO(csv_str), dialect=csv.Sniffer().sniff(csv_str.splitlines()[-1]))

doc = await doc_file.read()
doc = doc.decode()

header = next(csv_reader)
dataset_type = get_dataset_type(header)
if dataset_type != 'matrix':
return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content="Invalid CSV file; data type does not seem to be a matrix.")
data = [header]
data.extend(csv_reader) # make sure header is included in data
data = process_data(data)

s, success = await profile_matrix(data=data, doc=doc, dataset_name=csv_file.filename, doc_name=doc_file.filename, gpt_key=gpt_key, smart=smart)

if not success:
return JSONResponse(status_code=status.HTTP_401_UNAUTHORIZED, content=s)

return ast.literal_eval(s)

@router.post("/link_dataset_col_to_dkg", tags=["Paper-2-annotated-vars"], response_model=Dict[str, TabularProfile])
async def link_dataset_columns_to_dkg_info(gpt_key: str, csv_file: UploadFile = File(...),
doc_file: UploadFile = File(...), smart: Optional[bool] = False) -> JSONResponse:
"""
Smart run provides better results but may result in slow response times as a consequence of extra GPT calls.
"""
csv_string = await csv_file.read()
csv_string = csv_string.decode()
buf = io.StringIO(csv_string)
csv_str = buf.readline() + '\n' + '\n'.join(random.sample(buf.readlines(), 5))
csv_str = csv_string.decode()
csv_reader = csv.reader(io.StringIO(csv_str), dialect=csv.Sniffer().sniff(csv_str.splitlines()[-1]))

doc = await doc_file.read()
doc = doc.decode()
s, success = await dataset_header_document_dkg(header=csv_str, doc=doc, gpt_key=gpt_key)

header = next(csv_reader)
dataset_type = get_dataset_type(header)
if dataset_type == 'matrix':
return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content="Invalid CSV file; seems to be a matrix, not tabular.")
elif dataset_type == 'no-header':
return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content="Invalid CSV file; no header found.")

data = [header]
data.extend(csv_reader) # make sure header is included in data
data = process_data(data)
s, success = await dataset_header_document_dkg(data=data, doc=doc, dataset_name=csv_file.filename, doc_name=doc_file.filename, gpt_key=gpt_key, smart=smart)

if not success:
return JSONResponse(status_code=status.HTTP_401_UNAUTHORIZED, content=s)

return ast.literal_eval(s)

# @router.post("/link_dataset_col_to_dkg", tags=["Paper-2-annotated-vars"])
# def link_dataset_columns_to_dkg_info(csv_str: str, gpt_key: str):
# def link_dataset_columns_to_dkg_info(csv_str: str, gpt_key: str) -> JSONResponse:
# s, success = dataset_header_dkg(header=csv_str, gpt_key=gpt_key)
#
# if not success:
@@ -87,7 +131,7 @@ async def link_dataset_columns_to_dkg_info(gpt_key: str, csv_file: UploadFile =


@router.post("/upload_file_extract/", tags=["Paper-2-annotated-vars"])
async def upload_file_annotate(gpt_key: str, file: UploadFile = File(...)):
async def upload_file_annotate(gpt_key: str, file: UploadFile = File(...)) -> JSONResponse:
"""
User Warning: Calling APIs may result in slow response times as a consequence of GPT-4.
"""
Expand All @@ -102,4 +146,5 @@ async def upload_file_annotate(gpt_key: str, file: UploadFile = File(...)):

# return {"file name": res_file, "file contents": text}
except Exception as e:
print(str(e))
raise HTTPException(status_code=400, detail=str(e))
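
The docstring above notes that a smart run trades latency for quality. A hedged sketch of how a client might exercise the revised `/link_dataset_col_to_dkg` endpoint follows; it again assumes the router is mounted at the root path, and the file names are placeholders.

```python
# Hedged sketch: uploading a CSV and its companion document to the revised
# /link_dataset_col_to_dkg endpoint. smart=true enables the extra GPT calls
# described in the docstring, so expect slower responses. Paths are placeholders.
import requests

BASE_URL = "http://3.83.68.208"
GPT_KEY = "sk-..."

with open("cases.csv", "rb") as csv_f, open("paper.txt", "rb") as doc_f:
    resp = requests.post(
        f"{BASE_URL}/link_dataset_col_to_dkg",
        params={"gpt_key": GPT_KEY, "smart": "true"},
        files={"csv_file": csv_f, "doc_file": doc_f},
    )

if resp.status_code == 400:
    # e.g. the CSV looked like a matrix or had no header (see the checks above)
    print("Bad request:", resp.json())
else:
    resp.raise_for_status()
    print(resp.json())  # Dict[str, TabularProfile]
```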
128 changes: 105 additions & 23 deletions api/routers/cards.py
@@ -1,36 +1,73 @@
import csv
from math import isnan
import ast, io, random, sys, os
import asyncio
from typing import Optional, Union

from openai import OpenAIError
from fastapi import APIRouter, status, UploadFile, File
from fastapi.responses import JSONResponse

sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
from src.connect import construct_data_card, dataset_header_document_dkg, construct_model_card
from src.connect import construct_data_card, dataset_header_document_dkg, construct_model_card, profile_matrix, get_dataset_type, process_data
from src.response_types import MatrixDataCard, TabularDataCard, ModelCard

router = APIRouter()

@router.post("/get_data_card", tags=["Data-and-model-cards"])
async def get_data_card(gpt_key: str, csv_file: UploadFile = File(...), doc_file: UploadFile = File(...)):

@router.post("/get_data_card", tags=["Data-and-model-cards"], response_model=Union[MatrixDataCard, TabularDataCard])
async def get_data_card(gpt_key: str, csv_file: UploadFile = File(...), doc_file: UploadFile = File(...), smart: Optional[bool] = False) -> JSONResponse:
"""
Smart run provides better results but may result in slow response times as a consequence of extra GPT calls.
"""
files = [csv_file.read(), doc_file.read()]
csv, doc = await asyncio.gather(*files)
_csv, doc = await asyncio.gather(*files)
_csv = _csv.decode().strip()
doc = doc.decode().strip()

# process CSV; get header and <= 5 random rows
csv_string = csv.decode()
csv_strings = csv_string.split('\n')
if len(csv_strings) == 0:
# TODO handle inputs that are too long to fit in the context window
if len(_csv) == 0:
return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content="Empty CSV file")
num_rows_to_sample = min(5, len(csv_strings) - 1)
csv_str = csv_strings[0] + '\n' + '\n'.join(random.sample(csv_strings[1:], num_rows_to_sample))

# process doc
# TODO: handle docs that are too long to fit in the context window
doc = doc.decode()
if len(doc) == 0:
return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content="Empty document file")

csv_reader = csv.reader(io.StringIO(_csv), dialect=csv.Sniffer().sniff(_csv.splitlines()[-1]))

header = next(csv_reader) # can determine type from the header row
data_type = get_dataset_type(header)
if data_type == 'header-0':
schema = header
profiler = dataset_header_document_dkg
elif data_type == 'no-header':
# Probably best not to support this; the code path is poorly tested, and it's not clear what the expected behavior is.
# Either way, this should never come up in the Evaluation.
#schema = None
#profiler = dataset_header_dkg
return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content="Invalid CSV file; no header found.")
elif data_type == 'matrix':
schema = None
profiler = profile_matrix
else:
return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content="Invalid CSV file; could not determine data type")

data = [header]
data.extend(csv_reader) # make sure header is included in data
data = process_data(data)

calls = [
construct_data_card(data_doc=doc, dataset_name=csv_file.filename, doc_name=doc_file.filename, dataset_type=data_type, gpt_key=gpt_key),
profiler(data=data, doc=doc, dataset_name=csv_file.filename, doc_name=doc_file.filename, gpt_key=gpt_key, smart=smart)
]

try:
results = await asyncio.gather(*calls)
except OpenAIError as err:
if "maximum context" in str(err):
return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content="Input too long. Please reduce the size of your input.")
else:
return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content=f"OpenAI connection error: {err}")

calls = [construct_data_card(data=csv_str, data_doc=doc, gpt_key=gpt_key), dataset_header_document_dkg(header=csv_str, doc=doc, gpt_key=gpt_key)]
results = await asyncio.gather(*calls)
for s, success in results:
if not success:
return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content=s)
@@ -39,13 +76,51 @@ async def get_data_card(gpt_key: str, csv_file: UploadFile = File(...), doc_file
data_profiling = ast.literal_eval(results[1][0])
if 'DATA_PROFILING_RESULT' in data_card:
return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content='DATA_PROFILING_RESULT cannot be a requested field in the data card.')
data_card['DATA_PROFILING_RESULT'] = data_profiling

if data_type == 'header-0':
data_card['SCHEMA'] = schema
# get a random sample of a row from the csv
data_card['EXAMPLES'] = {k.strip(): v for k, v in zip(schema, random.sample(list(data[1:]), 1)[0])}
data_card['DATA_PROFILING_RESULT'] = data_profiling
elif data_type == 'no-header':
if 'SCHEMA' not in data_card:
return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content='SCHEMA not found in data card')
schema = [s.strip() for s in data_card['SCHEMA'].split(',')]
schema = [s[1:] if s.startswith('[') else s for s in schema]
schema = [s[:-1] if s.endswith(']') else s for s in schema]
aligned_data_profiling = {}
for k, v in data_profiling.items():
k = int(k)
k = schema[k]
aligned_data_profiling[k] = v
data_card['DATA_PROFILING_RESULT'] = aligned_data_profiling
data_card['EXAMPLES'] = {k.strip(): v for k, v in zip(schema, random.sample(list(data), 1)[0])}
elif data_type == 'matrix':
data_card['DATA_PROFILING_RESULT'] = data_profiling
data_card['EXAMPLES'] = random.sample(data, 1)[0]
else:
raise Exception('Invalid data type')

def _fill_nan(ex):
if isinstance(ex, dict):
for k, v in ex.items():
ex[k] = _fill_nan(v)
elif isinstance(ex, list):
for i in range(len(ex)):
ex[i] = _fill_nan(ex[i])
elif isinstance(ex, float) and isnan(ex):
ex = None
return ex

data_card['EXAMPLES'] = _fill_nan(data_card['EXAMPLES'])

data_card['DATASET_TYPE'] = "matrix" if data_type == 'matrix' else "tabular"

print(data_card)
return data_card


@router.post("/get_model_card", tags=["Data-and-model-cards"])
async def get_model_card(gpt_key: str, text_file: UploadFile = File(...), code_file: UploadFile = File(...)):
@router.post("/get_model_card", tags=["Data-and-model-cards"], response_model=ModelCard)
async def get_model_card(gpt_key: str, text_file: UploadFile = File(...), code_file: UploadFile = File(...)) -> JSONResponse:

files = [text_file.read(), code_file.read()]
text, code = await asyncio.gather(*files)
@@ -56,8 +131,15 @@ async def get_model_card(gpt_key: str, text_file: UploadFile = File(...), code_f
# process code
code_string = code.decode()

res, success = await construct_model_card(text=text_string, code=code_string, gpt_key=gpt_key)
try:
res, success = await construct_model_card(text=text_string, code=code_string, gpt_key=gpt_key)
except OpenAIError as err:
if "maximum context" in str(err):
return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content="Input too long. Please reduce the size of your input.")
else:
return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content=f"OpenAI connection error: {err}")

if not success:
return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content=res)
model_card = ast.literal_eval(res)
return model_card
return model_card
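
A similar sketch for the card endpoints, under the same assumptions (base URL from the README, cards router mounted at the root path, placeholder file names):

```python
# Hedged sketch: requesting a data card for a CSV/document pair. Per the error
# handling above, the endpoint returns HTTP 400 for empty files, matrix-vs-tabular
# mismatches, or inputs that exceed the model's context window.
import requests

BASE_URL = "http://3.83.68.208"
GPT_KEY = "sk-..."

with open("cases.csv", "rb") as csv_f, open("paper.txt", "rb") as doc_f:
    resp = requests.post(
        f"{BASE_URL}/get_data_card",
        params={"gpt_key": GPT_KEY, "smart": "false"},
        files={"csv_file": csv_f, "doc_file": doc_f},
    )

card = resp.json()
if resp.status_code == 200:
    print(card.get("DATASET_TYPE"), list(card.get("DATA_PROFILING_RESULT", {})))
else:
    print("Error:", resp.status_code, card)
```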