Skip to content

Commit 4af4ea5

Browse files
committed
Prepare for pypi release
1 parent f476f86 commit 4af4ea5

File tree

68 files changed

+39556
-125
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+39556
-125
lines changed

pyproject.toml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
[build-system]
2+
requires = ["setuptools>=42", "wheel"]
3+
build-backend = "setuptools.build_meta"
4+
5+
[project]
6+
name = "syncode"
7+
version = "0.4.0"
8+
description = "Grammar-guided code generation tool"
9+
readme = "README.md"
10+
authors = [
11+
{name = "Shubham Ugare", email = "shubhamugare@gmail.com"}
12+
]
13+
license = {text = "MIT"}
14+
classifiers = [
15+
"Programming Language :: Python :: 3",
16+
"Intended Audience :: Science/Research",
17+
"License :: OSI Approved :: MIT License",
18+
"Operating System :: OS Independent",
19+
]
20+
dependencies = [
21+
"fire",
22+
"interegular",
23+
"regex==2023.8.8",
24+
"torch",
25+
"tqdm",
26+
"transformers==4.44.0",
27+
"datasets",
28+
"jsonschema",
29+
]
30+
31+
[project.urls]
32+
"Homepage" = "https://github.com/shubhamugare/syncode"
33+
"Bug Tracker" = "https://github.com/shubhamugare/syncode/issues"

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,5 @@ regex==2023.8.8
44
torch
55
tqdm
66
transformers==4.44.0
7-
mxeval @ git+https://github.com/shubhamugare/mxeval.git
87
datasets
98
jsonschema

setup.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,33 @@
33
with open("README.md", "r", encoding="utf-8") as fh:
44
long_description = fh.read()
55

6-
# Read the content of the requirements.txt file
7-
with open('requirements.txt', 'r', encoding='utf-8') as f:
8-
requirements = f.read().splitlines()
6+
# Runtime requirements listed inline (no longer read from requirements.txt); the external mxeval dependency is dropped
7+
requirements = [
8+
"fire",
9+
"interegular",
10+
"regex==2023.8.8",
11+
"torch",
12+
"tqdm",
13+
"transformers==4.44.0",
14+
"datasets",
15+
"jsonschema"
16+
]
917

1018
setuptools.setup(
1119
name="syncode",
12-
version="0.1",
20+
version="0.4.0",
1321
author="Shubham Ugare",
1422
author_email="shubhamugare@gmail.com",
1523
description="This package provides the tool for grammar augmented LLM generation.",
1624
long_description=long_description,
1725
long_description_content_type="text/markdown",
18-
url="https://github.com/shubhamugare/syncode",
26+
url="https://github.com/uiuc-focal-lab/syncode",
1927
include_package_data=True,
2028
packages=setuptools.find_packages(),
2129
install_requires=requirements,
2230
classifiers=[
2331
"Programming Language :: Python :: 3",
24-
"Intended Audience :: Science/Research",
32+
"Intended Audience :: Science/Research",
2533
"License :: OSI Approved :: MIT License",
2634
"Operating System :: OS Independent",
2735
],

syncode/dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from datasets import load_dataset
2-
from mxeval.data import get_data, get_examples
2+
from syncode.evaluation.mxeval.data import get_data, get_examples
33

44
class Dataset:
55
"""

syncode/evaluation/code_eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from typing import Optional
66
from syncode import common
77
from syncode.evaluation.mxeval_evaluation import check_corectness
8-
from mxeval.data import write_jsonl
8+
from syncode.evaluation.mxeval.data import write_jsonl
99

1010

1111
class CodeEval:

syncode/evaluation/fol_eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import random
55
import re
66
from typing import Optional
7-
from mxeval.data import write_jsonl
7+
from syncode.evaluation.mxeval_evaluation import write_jsonl
88
from tqdm import tqdm
99
import signal
1010
from syncode.parsers import create_base_parser

syncode/evaluation/json_eval.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from tqdm import tqdm
22
from typing import Optional
3-
from mxeval.data import write_jsonl
3+
from syncode.evaluation.mxeval.data import write_jsonl
44
import ast
55
import json
66
from jsonschema import validate, ValidationError
@@ -18,7 +18,8 @@ def run_json_eval(
1818
out_path: Optional[str],
1919
debug_task_id: Optional[int] = None,
2020
logger=common.EmptyLogger(),
21-
prompt_type='original'
21+
prompt_type='original',
22+
num_tasks=None
2223
):
2324
problems = syncode.dataset.problems
2425
if syncode.grammar_decoder is not None:
@@ -27,6 +28,9 @@ def run_json_eval(
2728

2829
if debug_task_id is not None:
2930
problems = [problems[debug_task_id]]
31+
32+
if num_tasks is not None:
33+
problems = problems[:num_tasks]
3034

3135
samples = []
3236
outputs = []

syncode/evaluation/math_eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from tqdm import tqdm
44
from syncode import common
55
from syncode.evaluation.mxeval_evaluation import compute_pass_at_k
6-
from mxeval.data import write_jsonl
6+
from syncode.evaluation.mxeval_evaluation import write_jsonl
77

88

99
class MathEval:

syncode/evaluation/mxeval/__init__.py

Whitespace-only changes.

syncode/evaluation/mxeval/data.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
from typing import Iterable, Dict
2+
import gzip
3+
import json
4+
import os
5+
6+
7+
# Directory containing this module; bundled dataset files are resolved
# relative to it.
ROOT = os.path.dirname(os.path.abspath(__file__))

# The multilingual HumanEval metadata is loaded once, at import time.
# A separate private name holds the path so that the public constant is
# only ever bound to the parsed JSON content.
_MULTILINGUAL_HUMANEVAL_METADATA_PATH = os.path.join(
    ROOT, "data", "multilingual_humaneval", "metadata.json"
)
with open(_MULTILINGUAL_HUMANEVAL_METADATA_PATH, "r", encoding="utf-8") as fr:
    MULTILINGUAL_HUMANEVAL_METADATA = json.load(fr)

# Path of the Python HumanEval problem file; its file name is taken from the
# "python" entry of the metadata loaded above.
HUMAN_EVAL_PYTHON = os.path.join(
    ROOT, "data", "multilingual_humaneval", MULTILINGUAL_HUMANEVAL_METADATA["python"]
)
# Alias for the Python problem file.
HUMAN_EVAL = HUMAN_EVAL_PYTHON
13+
14+
15+
def read_problems(evalset_file: str = HUMAN_EVAL_PYTHON) -> Dict[str, Dict]:
    """
    Reads a jsonl problem file and returns its tasks keyed by "task_id".

    If several records share a task id, the last one read is kept.
    """
    return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
17+
18+
19+
def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Parses each jsonl line and yields it as a dictionary

    Files whose name ends in ".gz" are decompressed on the fly. Lines made up
    only of whitespace are skipped. This is a generator, so the file is opened
    on first iteration rather than when the function is called.
    """
    # Decode explicitly as UTF-8 (what write_jsonl emits) instead of relying
    # on the platform's default encoding.
    if filename.endswith(".gz"):
        fp = gzip.open(filename, "rt", encoding="utf-8")
    else:
        fp = open(filename, "r", encoding="utf-8")
    with fp:
        for line in fp:
            # strip() leaves an empty string for blank / whitespace-only lines
            if line.strip():
                yield json.loads(line)
34+
35+
36+
def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
    """
    Writes an iterable of dictionaries to jsonl

    Each item is serialised with json.dumps and written as one UTF-8 encoded
    line. A leading "~" in filename is expanded. If filename ends in ".gz" the
    output is gzip-compressed; with append=True a new gzip member is added
    after the existing content. With append=False an existing file is
    overwritten.
    """
    mode = 'ab' if append else 'wb'
    filename = os.path.expanduser(filename)
    with open(filename, mode) as fp:
        if filename.endswith(".gz"):
            # The outer file handle decides append vs. truncate; the gzip
            # layer always starts a fresh compressed member on top of it.
            with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp:
                _write_json_lines(gzfp, data)
        else:
            _write_json_lines(fp, data)


def _write_json_lines(sink, data):
    """Writes every item of data to the binary stream sink as one JSON line."""
    for x in data:
        sink.write((json.dumps(x) + "\n").encode('utf-8'))
54+
55+
56+
def get_metadata(dataset, metadata_type="problem"):
    """
    Loads the metadata file of a bundled dataset.

    dataset must be one of "mbxp", "multi-humaneval" or "mathqa-x";
    metadata_type must be "problem" (metadata.json) or "example"
    (metadata_examples.json). Unsupported values fail an assertion.

    Returns a tuple (metadata, datadir): the parsed JSON content and the
    directory the metadata file was read from.
    """
    assert metadata_type in ["problem", "example"]
    assert dataset in ["mbxp", "multi-humaneval", "mathqa-x"], f"Unsupported dataset {dataset}"
    dataset_dirmap = {
        "mbxp": "mbxp",
        "multi-humaneval": "multilingual_humaneval",
        "mathqa-x": "multilingual_mathqa",
    }
    typemap = {
        "problem": "metadata.json",
        "example": "metadata_examples.json",
    }
    datadir = os.path.join(ROOT, "data", dataset_dirmap[dataset])
    path = os.path.join(datadir, typemap[metadata_type])
    # Explicit UTF-8, matching how the module-level metadata file is opened.
    with open(path, "r", encoding="utf-8") as f:
        metadata = json.load(f)
    return metadata, datadir
69+
70+
71+
def get_supported_langs(dataset):
    """
    Returns the list of language keys found in the dataset's problem metadata.
    """
    metadata, _ = get_metadata(dataset, metadata_type="problem")
    # Iterating a dict yields its keys, so no .keys() call is needed.
    return list(metadata)
74+
75+
76+
def get_data(dataset="mbxp", language="python"):
    """
    Loads the problems of a dataset for one language.

    The language is matched case-insensitively against the keys of the
    dataset's problem metadata. Raises ValueError if it is not listed there.
    Prints a short progress message and returns the problems keyed by
    task id, as produced by read_problems.
    """
    metadata, datadir = get_metadata(dataset, metadata_type="problem")
    lang_key = language.lower()
    if lang_key not in metadata:
        raise ValueError(f"Language {language} not found in metadata file")
    datafile = metadata[lang_key]
    print(f"Loading {dataset} | language = {language}")
    return read_problems(os.path.join(datadir, datafile))
83+
84+
85+
# due to similar format, examples from mbxp are sufficient to be used
# for few-shot prompting in multi-humaneval
def get_examples(dataset="mbxp", language="python", num_examples=None):
    """
    Loads few-shot examples for a language.

    Only the "mbxp" dataset is accepted (checked with an assertion). The
    language is matched case-insensitively against the example metadata;
    ValueError is raised if it is not listed.

    With num_examples=None the lazy stream of example records is returned
    as-is. Otherwise a list of at most num_examples records is returned, each
    with its "prompt" replaced by the prompt of the problem that has the same
    task id.
    """
    assert dataset in ["mbxp"], f"No fewshot examples in dataset {dataset}"
    metadata, datadir = get_metadata(dataset=dataset, metadata_type="example")
    lang_key = language.lower()
    if lang_key not in metadata:
        raise ValueError(f"Language {language} not found in metadata file")
    print(f"Loading examples from {dataset} | language = {language}")
    # use streams
    stream = stream_jsonl(os.path.join(datadir, metadata[lang_key]))
    if num_examples is None:
        # return the entire stream
        return stream

    # Reuse the stream opened above rather than calling get_examples again,
    # which would re-read the metadata and print the loading message twice.
    problems = get_data(dataset=dataset, language=language)
    examples = []
    for idx, example in enumerate(stream):
        if idx == num_examples:
            break
        example["prompt"] = problems[example["task_id"]]["prompt"]
        examples.append(example)
    return examples

0 commit comments

Comments
 (0)