Skip to content

Commit

Permalink
better naming in index
Browse files Browse the repository at this point in the history
  • Loading branch information
wsxiaoys committed May 23, 2024
1 parent b07a5b6 commit 612456a
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 29 deletions.
2 changes: 1 addition & 1 deletion crates/tabby-scheduler/src/code/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ fn get_git_hash(path: &Path) -> Result<String> {
}

#[derive(Deserialize, Serialize, Debug)]
pub struct SourceFileKey {
struct SourceFileKey {
path: PathBuf,
language: String,
git_hash: String,
Expand Down
21 changes: 13 additions & 8 deletions crates/tabby-scheduler/src/code/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use kv::Batch;
use tabby_common::config::RepositoryConfig;
use tracing::warn;

use super::{cache::CacheStore, create_code_index, intelligence::SourceCode};
use super::{cache::CacheStore, create_code_index, intelligence::SourceCode, KeyedSourceCode};
use crate::Indexer;

// Magic numbers
Expand All @@ -25,7 +25,7 @@ pub fn garbage_collection(cache: &mut CacheStore) {
async fn add_changed_documents(
cache: &mut CacheStore,
repository: &RepositoryConfig,
index: &Indexer<SourceCode>,
index: &Indexer<KeyedSourceCode>,
) {
let mut indexed_files_batch = Batch::new();
for file in Walk::new(repository.dir()) {
Expand All @@ -36,28 +36,33 @@ async fn add_changed_documents(
continue;
}
};
let Some(source_file) = cache.get_source_file(repository, file.path()) else {
let Some(code) = cache.get_source_file(repository, file.path()) else {
continue;
};
if !is_valid_file(&source_file) {
if !is_valid_file(&code) {
continue;
}

let (file_id, indexed) = cache.check_indexed(file.path());
let (key, indexed) = cache.check_indexed(file.path());
if indexed {
continue;
}
index.add(source_file).await;
index
.add(KeyedSourceCode {
key: key.clone(),
code,
})
.await;
indexed_files_batch
.set(&file_id, &String::new())
.set(&key, &String::new())
.expect("Failed to mark file as indexed");
}

// Mark all indexed documents as indexed
cache.apply_indexed(indexed_files_batch);
}

fn remove_staled_documents(cache: &mut CacheStore, index: &Indexer<SourceCode>) {
fn remove_staled_documents(cache: &mut CacheStore, index: &Indexer<KeyedSourceCode>) {
// Create a new writer to commit deletion of removed indexed files
let gc_commit = cache.prepare_garbage_collection_for_indexed_files(|key| {
index.delete(key);
Expand Down
38 changes: 20 additions & 18 deletions crates/tabby-scheduler/src/code/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ use serde_json::json;
use tabby_common::{config::RepositoryConfig, index::code};
use tracing::{info, warn};

use self::{cache::SourceFileKey, intelligence::SourceCode};
use crate::{code::intelligence::CodeIntelligence, Indexer, IndexAttributeBuilder};
use self::intelligence::SourceCode;
use crate::{code::intelligence::CodeIntelligence, IndexAttributeBuilder, Indexer};

/// Module for creating code search index.
mod cache;
Expand Down Expand Up @@ -41,51 +41,53 @@ impl CodeIndex {
}
}

/// A [`SourceCode`] bundled with the key it is indexed under.
///
/// `CodeBuilder::build_id` formats `key` into the document id, while the
/// chunk attributes are produced from `code`.
struct KeyedSourceCode {
/// Key identifying this file; passed through `format_id` to form the document id.
key: String,
/// The source file whose content is read, chunked, and indexed.
code: SourceCode,
}

/// Unit struct implementing `IndexAttributeBuilder` for `KeyedSourceCode`;
/// it carries no fields of its own.
struct CodeBuilder;

#[async_trait]
impl IndexAttributeBuilder<SourceCode> for CodeBuilder {
impl IndexAttributeBuilder<KeyedSourceCode> for CodeBuilder {
fn format_id(&self, id: &str) -> String {
format!("code:{}", id)
}

async fn build_id(&self, source_code: &SourceCode) -> String {
let path = source_code.absolute_path();
let id = SourceFileKey::try_from(path.as_path())
.expect("Failed to build ID from path")
.to_string();
self.format_id(&id)
async fn build_id(&self, source_code: &KeyedSourceCode) -> String {
self.format_id(&source_code.key)
}

async fn build_attributes(&self, _source_code: &SourceCode) -> serde_json::Value {
async fn build_attributes(&self, _source_code: &KeyedSourceCode) -> serde_json::Value {
json!({})
}

async fn build_chunk_attributes(
&self,
source_file: &SourceCode,
source_code: &KeyedSourceCode,
) -> BoxStream<(Vec<String>, serde_json::Value)> {
let text = match source_file.read_content() {
let source_code = &source_code.code;
let text = match source_code.read_content() {
Ok(content) => content,
Err(e) => {
warn!(
"Failed to read content of '{}': {}",
source_file.filepath, e
source_code.filepath, e
);

return Box::pin(futures::stream::empty());
}
};

let source_file = source_file.clone();
let source_code = source_code.clone();
let s = stream! {
let intelligence = CodeIntelligence::default();
for (start_line, body) in intelligence.chunks(&text) {
let tokens = code::tokenize_code(body);
yield (tokens, json!({
code::fields::CHUNK_FILEPATH: source_file.filepath,
code::fields::CHUNK_GIT_URL: source_file.git_url,
code::fields::CHUNK_LANGUAGE: source_file.language,
code::fields::CHUNK_FILEPATH: source_code.filepath,
code::fields::CHUNK_GIT_URL: source_code.git_url,
code::fields::CHUNK_LANGUAGE: source_code.language,
code::fields::CHUNK_BODY: body,
code::fields::CHUNK_START_LINE: start_line,
}));
Expand All @@ -96,7 +98,7 @@ impl IndexAttributeBuilder<SourceCode> for CodeBuilder {
}
}

fn create_code_index() -> Indexer<SourceCode> {
fn create_code_index() -> Indexer<KeyedSourceCode> {
let builder = CodeBuilder;
Indexer::new(builder)
}
2 changes: 1 addition & 1 deletion crates/tabby-scheduler/src/doc/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use tantivy::doc;
use text_splitter::TextSplitter;
use tracing::warn;

use crate::{Indexer, IndexAttributeBuilder};
use crate::{IndexAttributeBuilder, Indexer};

pub struct SourceDocument {
pub id: String,
Expand Down
2 changes: 1 addition & 1 deletion crates/tabby-scheduler/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ pub use code::CodeIndex;
use crawl::crawl_pipeline;
use doc::SourceDocument;
use futures::StreamExt;
use index::{Indexer, IndexAttributeBuilder};
use index::{IndexAttributeBuilder, Indexer};

mod doc;
use std::{env, sync::Arc};
Expand Down

0 comments on commit 612456a

Please sign in to comment.