-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathevaluation_example.rs
116 lines (99 loc) · 3.87 KB
/
evaluation_example.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
//! Example demonstrating evaluation and comparison of multiple LLM providers
//!
//! This example shows how to:
//! 1. Initialize multiple LLM providers (Anthropic, Phind, DeepSeek)
//! 2. Configure scoring functions to evaluate responses
//! 3. Send the same prompt to all providers
//! 4. Compare and score the responses
use rllm::{
builder::{LLMBackend, LLMBuilder},
chat::{ChatMessage, ChatRole},
evaluator::{EvalResult, LLMEvaluator},
};
/// Entry point: builds three LLM providers (Anthropic, Phind, DeepSeek),
/// attaches two scoring functions to an [`LLMEvaluator`], sends one shared
/// prompt to every provider, and prints each response with its total score.
///
/// # Errors
///
/// Returns an error if any provider fails to build (e.g. invalid
/// configuration) or if the chat evaluation against any backend fails.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // --- Provider setup -------------------------------------------------
    // Anthropic provider with a Claude model. The key is read from the
    // environment; `unwrap_or_else` defers constructing the placeholder
    // fallback until the variable is actually missing (Clippy: or_fun_call),
    // unlike `unwrap_or`, which would allocate the String unconditionally.
    let anthropic = LLMBuilder::new()
        .backend(LLMBackend::Anthropic)
        .model("claude-3-5-sonnet-20240620")
        .api_key(std::env::var("ANTHROPIC_API_KEY").unwrap_or_else(|_| "anthropic-key".into()))
        .build()?;

    // Phind provider, specialized for code; no API key is configured here.
    let phind = LLMBuilder::new()
        .backend(LLMBackend::Phind)
        .model("Phind-70B")
        .build()?;

    // DeepSeek general chat provider; same lazy-fallback pattern as above.
    let deepseek = LLMBuilder::new()
        .backend(LLMBackend::DeepSeek)
        .model("deepseek-chat")
        .api_key(std::env::var("DEEPSEEK_API_KEY").unwrap_or_else(|_| "deepseek-key".into()))
        .build()?;

    // --- Scoring --------------------------------------------------------
    // The evaluator runs every scoring closure against each response and
    // combines their results into a single score per provider.
    let evaluator = LLMEvaluator::new(vec![anthropic, phind, deepseek])
        // First scoring function: code quality and completeness.
        // Feature checks are nested under the code-fence check on purpose:
        // Rust-specific markers only count when a code block is present.
        .scoring(|response| {
            let mut score = 0.0;
            if response.contains("```") {
                score += 1.0;
                if response.contains("```rust") {
                    score += 2.0;
                }
                if response.contains("use actix_web::") {
                    score += 2.0;
                }
                if response.contains("async fn") {
                    score += 1.0;
                }
                if response.contains("#[derive(") {
                    score += 1.0;
                }
                if response.contains("//") {
                    score += 1.0;
                }
            }
            score
        })
        // Second scoring function: explanation quality.
        .scoring(|response| {
            let mut score = 0.0;
            // Explanatory phrasing suggests the model walked through its answer.
            if response.contains("Here's how it works:") || response.contains("Let me explain:") {
                score += 2.0;
            }
            // Examples and practical usage (e.g. a curl invocation).
            if response.contains("For example") || response.contains("curl") {
                score += 1.5;
            }
            // Reward comprehensive responses by word count.
            let words = response.split_whitespace().count();
            if words > 100 {
                score += 1.0;
            }
            score
        });

    // --- Prompt ---------------------------------------------------------
    // Single user message asking for a Rust microservice implementation;
    // every provider receives this exact same prompt.
    let messages = vec![ChatMessage {
        role: ChatRole::User,
        content: "\
Create a Rust microservice using Actix Web.
It should have at least two routes:
1) A GET route returning a simple JSON status.
2) A POST route that accepts JSON data and responds with a success message.\n\
Include async usage, data structures with `#[derive(Serialize, Deserialize)]`, \
and show how to run it.\n\
Provide code blocks, comments, and a brief explanation of how it works.\
"
        .into(),
    }];

    // --- Run and report -------------------------------------------------
    // Evaluate the prompt across all providers, then print each response
    // with its aggregate score.
    let results: Vec<EvalResult> = evaluator.evaluate_chat(&messages)?;
    for (i, item) in results.iter().enumerate() {
        println!("\n=== LLM #{} ===", i);
        println!("Score: {:.2}", item.score);
        println!("Response:\n{}", item.text);
        println!("================\n");
    }
    Ok(())
}