Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions backend/app/database/weaviate/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from .operations import (
store_user_profile,
search_similar_contributors,
search_contributors_by_keywords,
get_contributor_profile,
WeaviateUserOperations
)

from .client import get_weaviate_client

# Public API of this package: the names imported above from `.operations`
# and `.client`.
__all__ = [
"store_user_profile",
"search_similar_contributors",
"search_contributors_by_keywords",
"get_contributor_profile",
"WeaviateUserOperations",
"get_weaviate_client"
]
191 changes: 180 additions & 11 deletions backend/app/database/weaviate/operations.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import logging
import json
from typing import Optional, Dict, Any
from typing import Optional, Dict, Any, List
from datetime import datetime, timezone
from app.models.database.weaviate import WeaviateUserProfile
from app.database.weaviate.client import get_weaviate_client
import weaviate.exceptions as weaviate_exceptions
import weaviate.classes as wvc
from weaviate.classes.query import Filter

logger = logging.getLogger(__name__)

Expand All @@ -26,7 +27,7 @@ async def find_user_by_id(self, user_id: str) -> Optional[str]:
collection = client.collections.get(self.collection_name)

response = await collection.query.fetch_objects(
where=wvc.query.Filter.by_property("user_id").equal(user_id),
filters=Filter.by_property("user_id").equal(user_id),
limit=1
)

Expand All @@ -43,7 +44,7 @@ async def find_user_by_id(self, user_id: str) -> Optional[str]:
logger.error(f"Unexpected error finding user by ID: {str(e)}")
return None

async def create_user_profile(self, profile: WeaviateUserProfile) -> bool:
async def create_user_profile(self, profile: WeaviateUserProfile, embedding_vector: List[float]) -> bool:
"""
Create a new user profile in Weaviate.
"""
Expand All @@ -54,7 +55,8 @@ async def create_user_profile(self, profile: WeaviateUserProfile) -> bool:
collection = client.collections.get(self.collection_name)

result = await collection.data.insert(
properties=profile_dict
properties=profile_dict,
vector=embedding_vector
)

logger.info(f"Created user profile for {profile.github_username} with UUID: {result}")
Expand All @@ -67,7 +69,7 @@ async def create_user_profile(self, profile: WeaviateUserProfile) -> bool:
logger.error(f"Unexpected error creating user profile: {str(e)}")
return False

async def update_user_profile(self, uuid: str, profile: WeaviateUserProfile) -> bool:
async def update_user_profile(self, uuid: str, profile: WeaviateUserProfile, embedding_vector: List[float]) -> bool:
"""
Update an existing user profile in Weaviate.
"""
Expand All @@ -78,7 +80,8 @@ async def update_user_profile(self, uuid: str, profile: WeaviateUserProfile) ->
collection = client.collections.get(self.collection_name)
await collection.data.update(
uuid=uuid,
properties=profile_dict
properties=profile_dict,
vector=embedding_vector
)

logger.info(f"Updated user profile for {profile.github_username} with UUID: {uuid}")
Expand All @@ -91,7 +94,7 @@ async def update_user_profile(self, uuid: str, profile: WeaviateUserProfile) ->
logger.error(f"Unexpected error updating user profile: {str(e)}")
return False

async def upsert_user_profile(self, profile: WeaviateUserProfile) -> bool:
async def upsert_user_profile(self, profile: WeaviateUserProfile, embedding_vector: List[float]) -> bool:
"""
Create or update a user profile (upsert operation).
"""
Expand All @@ -100,15 +103,162 @@ async def upsert_user_profile(self, profile: WeaviateUserProfile) -> bool:

if existing_uuid:
logger.info(f"Updating existing profile for user_id: {profile.user_id}")
return await self.update_user_profile(existing_uuid, profile)
return await self.update_user_profile(existing_uuid, profile, embedding_vector)
else:
logger.info(f"Creating new profile for user_id: {profile.user_id}")
return await self.create_user_profile(profile)
return await self.create_user_profile(profile, embedding_vector)

except Exception as e:
logger.error(f"Error in upsert operation: {str(e)}")
return False

async def search_similar_contributors(self, query_embedding: List[float], limit: int = 10, min_distance: float = 0.7) -> List[Dict[str, Any]]:
    """
    Search for similar contributors using vector similarity search.

    Args:
        query_embedding: Query vector handed to the collection's
            ``near_vector`` query.
        limit: Maximum number of objects to request.
        min_distance: Value forwarded as the ``distance`` argument of
            ``near_vector``. NOTE(review): despite the name this looks like
            an upper bound on the allowed distance -- confirm against the
            Weaviate documentation.

    Returns:
        One dict per successfully processed hit, holding selected profile
        properties plus ``distance`` and ``similarity_score``
        (``1.0 - distance``). An empty list is returned when the search
        raises.
    """
    try:
        logger.info(f"Searching for similar contributors with embedding dimension: {len(query_embedding)}")

        async with get_weaviate_client() as client:
            collection = client.collections.get(self.collection_name)

            response = await collection.query.near_vector(
                near_vector=query_embedding,
                limit=limit,
                distance=min_distance,
                return_metadata=wvc.query.MetadataQuery(distance=True)
            )

            results = []
            for obj in response.objects:
                try:
                    properties = obj.properties
                    metadata = obj.metadata

                    # Compare against None explicitly: a distance of 0.0
                    # (identical vectors) is falsy and must not be replaced
                    # by the 1.0 "unknown distance" fallback.
                    if metadata is not None and metadata.distance is not None:
                        distance = metadata.distance
                    else:
                        distance = 1.0
                    similarity_score = 1.0 - distance

                    result = {
                        "user_id": properties.get("user_id"),
                        "github_username": properties.get("github_username"),
                        "display_name": properties.get("display_name"),
                        "bio": properties.get("bio"),
                        "languages": properties.get("languages", []),
                        "topics": properties.get("topics", []),
                        "followers_count": properties.get("followers_count", 0),
                        "total_stars_received": properties.get("total_stars_received", 0),
                        "similarity_score": similarity_score,
                        "distance": distance,
                        "profile_summary": properties.get("profile_text_for_embedding", "")
                    }
                    results.append(result)

                except Exception as e:
                    # Best effort: one malformed hit must not discard the rest.
                    logger.warning(f"Error processing search result: {str(e)}")
                    continue

            logger.info(f"Found {len(results)} similar contributors")
            return results

    except weaviate_exceptions.WeaviateBaseError as e:
        logger.error(f"Weaviate error in similarity search: {str(e)}")
        return []
    except Exception as e:
        logger.error(f"Unexpected error in similarity search: {str(e)}")
        return []

async def search_contributors_by_keywords(self, keywords: List[str], limit: int = 10) -> List[Dict[str, Any]]:
    """
    Find contributors through a BM25 keyword query.

    The keywords are joined with single spaces into one query string and
    sent to the collection's ``bm25`` query. Each hit becomes a dict of
    selected profile properties plus ``search_score``. Any failure of the
    search itself yields an empty list.
    """
    try:
        logger.info(f"Searching for contributors with keywords: {keywords}")

        async with get_weaviate_client() as client:
            contributors = client.collections.get(self.collection_name)

            hits = await contributors.query.bm25(
                query=" ".join(keywords),
                limit=limit,
                return_metadata=wvc.query.MetadataQuery(score=True)
            )

            matches = []
            for hit in hits.objects:
                try:
                    props = hit.properties
                    # Missing metadata or a falsy score both collapse to 0.0.
                    relevance = (hit.metadata and hit.metadata.score) or 0.0

                    matches.append({
                        "user_id": props.get("user_id"),
                        "github_username": props.get("github_username"),
                        "display_name": props.get("display_name"),
                        "bio": props.get("bio"),
                        "languages": props.get("languages", []),
                        "topics": props.get("topics", []),
                        "followers_count": props.get("followers_count", 0),
                        "total_stars_received": props.get("total_stars_received", 0),
                        "search_score": relevance,
                        "profile_summary": props.get("profile_text_for_embedding", "")
                    })
                except Exception as err:
                    # A hit that cannot be processed is logged and skipped.
                    logger.warning(f"Error processing keyword search result: {err!s}")

            logger.info(f"Found {len(matches)} contributors matching keywords")
            return matches

    except weaviate_exceptions.WeaviateBaseError as err:
        logger.error(f"Weaviate error in keyword search: {err!s}")
        return []
    except Exception as err:
        logger.error(f"Unexpected error in keyword search: {err!s}")
        return []

# TODO: Add hybrid search for contributors. The default built-in hybrid search doesn't support custom vectors.

async def get_contributor_profile(self, github_username: str) -> Optional[WeaviateUserProfile]:
    """
    Get a specific contributor's profile by GitHub username.

    Args:
        github_username: Value matched exactly against the
            ``github_username`` property.

    Returns:
        A ``WeaviateUserProfile`` built from the first matching object, or
        ``None`` when nothing matches or the lookup raises.
    """
    try:
        async with get_weaviate_client() as client:
            collection = client.collections.get(self.collection_name)

            response = await collection.query.fetch_objects(
                filters=Filter.by_property("github_username").equal(github_username),
                limit=1
            )

            if not response.objects:
                return None

            properties = response.objects[0].properties

            # `dict.get(key, default)` only falls back when the key is absent;
            # a key stored with a null value would hand None to json.loads and
            # raise TypeError. `or "[]"` covers both cases.
            repositories = json.loads(properties.get("repositories") or "[]")
            pull_requests = json.loads(properties.get("pull_requests") or "[]")

            return WeaviateUserProfile(
                user_id=properties.get("user_id"),
                github_username=properties.get("github_username"),
                display_name=properties.get("display_name"),
                bio=properties.get("bio"),
                location=properties.get("location"),
                languages=properties.get("languages", []),
                topics=properties.get("topics", []),
                followers_count=properties.get("followers_count", 0),
                following_count=properties.get("following_count", 0),
                total_stars_received=properties.get("total_stars_received", 0),
                total_forks=properties.get("total_forks", 0),
                repositories=repositories,
                pull_requests=pull_requests,
                profile_text_for_embedding=properties.get("profile_text_for_embedding", ""),
                last_updated=properties.get("last_updated")
            )

    except weaviate_exceptions.WeaviateBaseError as e:
        logger.error(f"Weaviate error getting contributor profile: {str(e)}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error getting contributor profile: {str(e)}")
        return None

def _prepare_profile_data(self, profile: WeaviateUserProfile) -> Dict[str, Any]:
"""
Prepare profile data for Weaviate storage.
Expand All @@ -128,9 +278,28 @@ def _prepare_profile_data(self, profile: WeaviateUserProfile) -> Dict[str, Any]:
return profile_dict


async def store_user_profile(profile: WeaviateUserProfile) -> bool:
async def store_user_profile(profile: WeaviateUserProfile, embedding_vector: List[float]) -> bool:
"""
Convenience function to store or update a user profile.
"""
operations = WeaviateUserOperations()
return await operations.upsert_user_profile(profile)
return await operations.upsert_user_profile(profile, embedding_vector)

async def search_similar_contributors(query_embedding: List[float], limit: int = 10, min_distance: float = 0.7) -> List[Dict[str, Any]]:
    """
    Module-level shortcut for a vector-similarity contributor search.

    Creates a fresh ``WeaviateUserOperations`` and forwards all arguments to
    its ``search_similar_contributors`` method.
    """
    return await WeaviateUserOperations().search_similar_contributors(
        query_embedding, limit, min_distance
    )

async def search_contributors_by_keywords(keywords: List[str], limit: int = 10) -> List[Dict[str, Any]]:
    """
    Module-level shortcut for a keyword-based contributor search.

    Creates a fresh ``WeaviateUserOperations`` and forwards all arguments to
    its ``search_contributors_by_keywords`` method.
    """
    return await WeaviateUserOperations().search_contributors_by_keywords(
        keywords, limit
    )

async def get_contributor_profile(github_username: str) -> Optional[WeaviateUserProfile]:
    """
    Module-level shortcut for looking up a contributor by GitHub username.

    Creates a fresh ``WeaviateUserOperations`` and delegates to its
    ``get_contributor_profile`` method.
    """
    return await WeaviateUserOperations().get_contributor_profile(github_username)
1 change: 1 addition & 0 deletions backend/app/database/weaviate/scripts/create_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ async def create_schema(client, name, properties):
await client.collections.create(
name=name,
properties=properties,
vectorizer_config=wc.Configure.Vectorizer.none()
)
print(f"Created: {name}")

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Prompt template asking a model to summarize a GitHub profile for contributor
# search. It contains the named placeholders {github_username}, {bio},
# {languages}, {pull_requests}, {topics} and {stats}; presumably the caller
# fills them with str.format -- confirm at the call site.
PROFILE_SUMMARIZATION_PROMPT = """You are a GitHub profile summarizer for a developer contributor recommendation system. Your task is to create a concise, keyword-rich summary optimized for semantic search and contributor matching. The summary should highlight the developer's technical expertise, recent contributions, and key projects to enable accurate and relevant recommendations.

PROFILE DATA:
- GitHub Username: {github_username}
- Bio: {bio}
- Languages: {languages}
- Recent Pull Requests: {pull_requests}
- Topics/Skills: {topics}
- Stats: {stats}

INSTRUCTIONS:
- Length: Maximum 150-200 words (at maximum 500 tokens).
- Lead with Top Skills: Start with the developer's most prominent technical skills and programming languages (e.g., Python, JavaScript, ML, AI).
- Focus on Recent Expertise: Emphasize areas of active, recent involvement, especially from pull requests and recent work.
- Include Key Projects/Organizations: Mention the most relevant projects or organizations the developer has contributed to.
- Use Specific Technology Names: Incorporate precise terms like frameworks, tools, and methodologies (e.g., React, TensorFlow, DevOps).
- Prioritize Pull Request Skills: Highlight skills and technologies mentioned in recent pull requests, as they reflect current expertise.
- Style: Write in a technical, keyword-dense style. Use action verbs and quantifiable achievements where possible (e.g., "Led development of...," "Optimized performance by...").
- Tone: Professional and focused. Avoid filler content; every word should support search relevance.
- Format: Plain text, no formatting elements (e.g., bullet points, bold text).

GOAL: Create a summary that is easily parsed by search algorithms, rich in relevant keywords, and clearly showcases the developer's technical strengths and recent contributions.

Create a focused, search-optimized profile summary in plain text format:"""
Loading