Skip to content

Commit

Permalink
chore: update count logic
Browse files Browse the repository at this point in the history
  • Loading branch information
phodal committed Dec 27, 2023
1 parent 0c468d8 commit 217e258
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 15 deletions.
22 changes: 14 additions & 8 deletions post-processor/process.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json


def process_jsonl(input_file, output_file):
with open(input_file, 'r') as f:
lines = f.readlines()
Expand All @@ -21,6 +22,8 @@ def process_jsonl(input_file, output_file):
for record in filtered_records:
f.write(json.dumps(record) + '\n')



def merge_jsonl(input_file1, input_file2, input_file3, output_file):
with open(input_file1, 'r') as f1, open(input_file2, 'r') as f2, open(input_file3, 'r') as f3:
lines1 = f1.readlines()
Expand All @@ -29,23 +32,26 @@ def merge_jsonl(input_file1, input_file2, input_file3, output_file):

records = []

# 处理第一个文件的记录
for line in lines1[:2000]:
# Process the records from the first file
for line in lines1[:4000]:
data = json.loads(line)
records.append(data)

# 处理第二个文件的记录
for line in lines2[:2000]:

random.shuffle(lines2)
# Process the records from the second file
for line in lines2[:1000]:
data = json.loads(line)
records.append(data)


# 处理第三个文件的记录
for line in lines3[:2000]:
random.shuffle(lines3)
# Process the records from the third file
for line in lines3[:1000]:
data = json.loads(line)
records.append(data)

# 写入合并后的记录到输出文件
random.shuffle(records)
# Write the merged records to the output file
with open(output_file, 'w') as f:
for record in records:
f.write(json.dumps(record) + '\n')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ data class BasicTestIns(
TestCodeBuilderType.METHOD_UNIT -> {
input.append("\nStart test code with `@Test` syntax here: \n")
}
else -> {}
}

return Instruction(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,6 @@ class ThresholdChecker(private val context: WorkerContext) {
*/
fun isMetThreshold(ins: Instruction): Boolean {
    // The budget applies to the whole sample: instruction, input and output are
    // concatenated and encoded together, and the resulting token count is compared
    // against the configured maximum.
    val totalToken = enc.encode(ins.instruction + ins.input + ins.output).size
    val maxTokenLength = context.qualityThreshold.maxTokenLength

    if (totalToken > maxTokenLength) {
        // Log the rejected instruction so dropped samples can be traced afterwards.
        logger.info("skip instruction ${ins.instruction} for over $maxTokenLength tokens")
        return false
    }

    // Exactly one exit for the accepted case; the previous trailing second `return`
    // could never execute and has been removed.
    return true
}
}

0 comments on commit 217e258

Please sign in to comment.