-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathProgram.cs
91 lines (76 loc) · 3.2 KB
/
Program.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
// https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.textcatalog.latentdirichletallocation?view=ml-dotnet
using Microsoft.ML;
/// <summary>
/// Input row for the ML.NET text pipeline: a single raw document.
/// </summary>
class LdaInput
{
    /// <summary>Raw document text. Defaults to an empty string so it is never null.</summary>
    public string Text { get; set; } = string.Empty;
}
/// <summary>
/// Output row of the ML.NET text pipeline: the values of the "Features" column.
/// </summary>
class LdaOutput
{
    /// <summary>
    /// Feature vector produced for one document. Defaults to an empty array so callers
    /// never observe null before a prediction has populated it.
    /// </summary>
    public float[] Features { get; set; } = Array.Empty<float>();
}
/// <summary>
/// Example program: computes LDA topic vectors for a few sentences with ML.NET, stores them
/// in a pgvector column, and prints the documents nearest to the first one.
/// </summary>
class Program
{
    // Number of LDA topics, which is also the dimension of the "embedding" vector column.
    // The table definition and the ML.NET pipeline must agree on this value.
    private const int EmbeddingDimensions = 20;

    /// <summary>
    /// Recreates the <c>documents</c> table, inserts the sample sentences together with their
    /// embeddings, and writes the content of the nearest neighbours of document 1 to the console.
    /// </summary>
    static async Task Main()
    {
        var connString = "Host=localhost;Database=pgvector_example";
        var dataSourceBuilder = new NpgsqlDataSourceBuilder(connString);
        dataSourceBuilder.UseVector();
        await using var dataSource = dataSourceBuilder.Build();

        // Open asynchronously and dispose the connection when Main exits; previously the
        // connection was opened synchronously and never disposed.
        await using var conn = await dataSource.OpenConnectionAsync();

        await using (var cmd = new NpgsqlCommand("CREATE EXTENSION IF NOT EXISTS vector", conn))
        {
            await cmd.ExecuteNonQueryAsync();
        }

        // The extension may have just been created, so refresh the connection's type catalog
        // before any vector value is sent.
        conn.ReloadTypes();

        await using (var cmd = new NpgsqlCommand("DROP TABLE IF EXISTS documents", conn))
        {
            await cmd.ExecuteNonQueryAsync();
        }

        var createTableSql =
            $"CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector({EmbeddingDimensions}))";
        await using (var cmd = new NpgsqlCommand(createTableSql, conn))
        {
            await cmd.ExecuteNonQueryAsync();
        }

        string[] input = {
            "The dog is barking",
            "The cat is purring",
            "The bear is growling"
        };
        var embeddings = GenerateEmbeddings(input);

        for (int i = 0; i < input.Length; i++)
        {
            await using (var cmd = new NpgsqlCommand("INSERT INTO documents (content, embedding) VALUES ($1, $2)", conn))
            {
                cmd.Parameters.AddWithValue(input[i]);
                cmd.Parameters.AddWithValue(new Vector(embeddings[i]));
                await cmd.ExecuteNonQueryAsync();
            }
        }

        // Rank every other document by its distance (pgvector "<=>" operator) to the
        // embedding of the reference document.
        var documentId = 1;
        await using (var cmd = new NpgsqlCommand("SELECT * FROM documents WHERE id != $1 ORDER BY embedding <=> (SELECT embedding FROM documents WHERE id = $1) LIMIT 5", conn))
        {
            cmd.Parameters.AddWithValue(documentId);
            await using (var reader = await cmd.ExecuteReaderAsync())
            {
                while (await reader.ReadAsync())
                {
                    // Column 1 is "content" (id, content, embedding).
                    Console.WriteLine(reader.GetString(1));
                }
            }
        }
    }

    /// <summary>
    /// Fits an ML.NET text pipeline ending in Latent Dirichlet Allocation on
    /// <paramref name="texts"/> and returns one feature vector per text, in input order.
    /// </summary>
    /// <param name="texts">Documents to embed.</param>
    /// <returns>
    /// An array with one "Features" vector per element of <paramref name="texts"/>;
    /// an empty array when <paramref name="texts"/> is empty.
    /// </returns>
    private static float[][] GenerateEmbeddings(string[] texts)
    {
        // Nothing to fit a model on; return no embeddings.
        if (texts.Length == 0)
        {
            return Array.Empty<float[]>();
        }

        var mlContext = new MLContext();

        // Materialize once: the rows are read by the data view and again for prediction,
        // so a deferred query would be re-evaluated on every pass.
        var input = texts.Select(v => new LdaInput { Text = v }).ToArray();
        var dataView = mlContext.Data.LoadFromEnumerable(input);

        var pipeline = mlContext.Transforms.Text.NormalizeText("NormalizedText", "Text")
            .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "NormalizedText"))
            .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("Tokens"))
            .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
            .Append(mlContext.Transforms.Text.ProduceNgrams("Tokens"))
            .Append(mlContext.Transforms.Text.LatentDirichletAllocation("Features", "Tokens", numberOfTopics: EmbeddingDimensions));

        var model = pipeline.Fit(dataView);

        // The prediction engine is disposable; ToArray() below runs every prediction
        // before the engine is released at the end of the method.
        using var engine = mlContext.Model.CreatePredictionEngine<LdaInput, LdaOutput>(model);
        return input.Select(v => engine.Predict(v).Features).ToArray();
    }
}