Skip to content

Commit

Permalink
add tools
Browse files Browse the repository at this point in the history
  • Loading branch information
ManonGlloy committed May 18, 2024
1 parent f3f648b commit 4d57b51
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 0 deletions.
10 changes: 10 additions & 0 deletions utils/extract_samples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
def extract_elements(input_file, output_file, num_elements):
    """Copy the first ``num_elements`` lines of ``input_file`` to ``output_file``.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the file to (over)write with the selected lines.
        num_elements: Number of leading lines to keep. ``None`` keeps every
            line; a negative value keeps all but the last ``-num_elements``
            lines (plain list-slice semantics).

    The input is read completely before the output is opened, so passing the
    same path for both arguments truncates the file in place.
    """
    from itertools import islice  # local import: nothing else here needs it

    with open(input_file, 'r') as f_in:
        if num_elements is None or num_elements >= 0:
            # Stop reading after the requested prefix instead of loading the
            # whole file only to throw most of it away.
            lines = list(islice(f_in, num_elements))
        else:
            # A negative count needs the total length, so the file has to be
            # read entirely.
            lines = f_in.readlines()[:num_elements]

    with open(output_file, 'w') as f_out:
        f_out.writelines(lines)


# Example: extract_elements(input_file="path_to_input_file", output_file="path_to_output_file", num_elements=100)
22 changes: 22 additions & 0 deletions utils/json_to_markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import json


def transform_to_markdown(input_file, output_file):
    """Render a JSON Lines task file as a Markdown document.

    Each non-blank line of ``input_file`` must be a JSON object with the keys
    ``id``, ``name``, ``instruction`` (str) and ``instances`` (a non-empty
    list whose first item has ``input`` and a str ``output``). Only the first
    instance is rendered. For every record the following section is written
    to ``output_file``: ``id`` and ``name`` lines, the instruction in a
    ``gherkin`` fence, the JSON-encoded input on an ``input = `` line, and the
    output in a ``python`` fence, followed by a blank line.

    Args:
        input_file: Path of the JSON Lines file to read.
        output_file: Path of the Markdown file to (over)write.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError, IndexError, TypeError: If a record lacks the expected shape.
    """
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            if not line.strip():
                # Blank separator/trailing lines are not records; json.loads
                # would reject them.
                continue
            data = json.loads(line)
            instance = data['instances'][0]
            # Assemble the whole section first so a malformed record never
            # leaves a half-written section behind.
            section = [
                f"id : {data['id']}\n",
                f"name : {data['name']}\n",
                "instruction :\n",
                "```gherkin\n",
                data['instruction'] + '\n',
                "```\n",
                "input = ",
                json.dumps(instance['input']) + "\n",
                "output:\n",
                "```python\n",
                instance['output'] + "\n",
                "```\n\n",
            ]
            outfile.writelines(section)


# transform_to_markdown(input_file="path_to_input_file", output_file="path_to_output_file")
44 changes: 44 additions & 0 deletions utils/markdown_to_jsonl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import json


def _read_block(infile, is_end, keep_indent=False):
    """Collect lines from ``infile`` until ``is_end(stripped_line)`` or EOF.

    The terminating line is consumed but not returned. Lines are fully
    stripped unless ``keep_indent`` is true, in which case only trailing
    whitespace is removed.
    """
    collected = []
    while True:
        raw = infile.readline()
        if not raw:
            # readline() returns '' only at EOF; stop rather than loop forever
            # on a truncated document.
            break
        stripped = raw.strip()
        if is_end(stripped):
            break
        collected.append(raw.rstrip() if keep_indent else stripped)
    return collected


def markdown_to_jsonl(input_file, output_file):
    """Convert a Markdown task document into a JSON Lines file.

    Recognised lines (after stripping surrounding whitespace):

    * ``id : <value>`` starts a new task.
    * ``name : <value>`` sets the task name.
    * ``instruction :`` collects every following line, including any fence
      lines, up to a line starting with ``input`` or ``output``. The lines
      are stripped and joined with ``"\\n\\t"``. The terminating line is
      consumed and not parsed.
    * A line starting with three backticks and ``python`` collects lines up
      to the next line starting with three backticks into
      ``instances[0]['output']``, preserving leading indentation.

    ``instances[0]['input']`` is always the empty string, and every task gets
    ``is_classification: false`` as its last key. One JSON object per task is
    written to ``output_file``.

    Args:
        input_file: Path of the Markdown file to read.
        output_file: Path of the JSON Lines file to (over)write.

    Raises:
        ValueError: If a name, instruction or python block appears before the
            first ``id :`` line.
    """
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        tasks = []
        task = None
        for line in infile:
            line = line.strip()
            if line.startswith("id :"):
                if task is not None:
                    tasks.append(task)
                # Split on the first colon only, so values that themselves
                # contain " : " are kept whole and an empty value is "".
                task = {'id': line.partition(":")[2].strip()}
                continue

            is_name = line.startswith("name :")
            is_instruction = line == "instruction :"
            is_python = line.startswith("```python")
            if not (is_name or is_instruction or is_python):
                continue
            if task is None:
                raise ValueError(
                    f"found {line!r} before any 'id :' line in {input_file}"
                )

            if is_name:
                task['name'] = line.partition(":")[2].strip()
            elif is_instruction:
                instruction_lines = _read_block(
                    infile, lambda s: s.startswith(("input", "output"))
                )
                task['instruction'] = "\n\t".join(instruction_lines)
            else:
                # Keep leading whitespace: stripping it would flatten the
                # indentation of the Python code being captured.
                output_lines = _read_block(
                    infile, lambda s: s.startswith("```"), keep_indent=True
                )
                if 'instances' not in task:
                    task['instances'] = [{'input': "", 'output': ''}]
                task['instances'][0]['output'] = "\n".join(output_lines)
        if task is not None:
            tasks.append(task)
        for t in tasks:
            # Added last so is_classification is the final key of each record.
            t['is_classification'] = False
            json.dump(t, outfile)
            outfile.write('\n')


# markdown_to_jsonl(input_file="path_to_input_file", output_file="path_to_output_file")

0 comments on commit 4d57b51

Please sign in to comment.