Skip to content

Commit

Permalink
Changes to enable integration between Astronomer, Airflow and Weaviate
Browse files Browse the repository at this point in the history
  • Loading branch information
bismuthsalamander committed Jan 29, 2024
1 parent f82ba90 commit ba48fcf
Show file tree
Hide file tree
Showing 8 changed files with 136 additions and 25 deletions.
13 changes: 13 additions & 0 deletions aiproxy/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# syntax=docker/dockerfile:1.4
# Image for the aiproxy Flask service: installs the Python dependencies listed
# in requirements.txt, copies the application source, and runs app.py.
# NOTE(review): --platform=$BUILDPLATFORM selects the base image for the build
# host's platform rather than the requested target platform, and the stage
# name "builder" is never referenced by a later stage — confirm both are intended.
FROM --platform=$BUILDPLATFORM python:3.10-alpine AS builder

WORKDIR /app

# Copy only the dependency manifest first so the install layer below is reused
# when application source changes but requirements.txt does not.
COPY requirements.txt /app
# The cache mount keeps pip's download cache between builds without storing it
# in an image layer.
RUN --mount=type=cache,target=/root/.cache/pip \
pip3 install -r requirements.txt

# Copy the rest of the build context (the application source) into /app.
COPY . /app

# Container start command resolves to `python3 app.py`; CMD supplies the
# default argument and can be overridden at `docker run` time.
ENTRYPOINT ["python3"]
CMD ["app.py"]
56 changes: 56 additions & 0 deletions aiproxy/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import re, os, sys
from urllib.parse import urlparse
import requests
from flask import Flask, request, make_response
app = Flask(__name__)

# Proxy configuration, read once from the environment at import time.
#   REDIRECT_PREFIX -> upstream base URL (trailing slashes removed)
#   URL_MATCH       -> optional regex applied to the rewritten URL
#   URL_REPLACE     -> optional replacement used together with URL_MATCH
#   VERBOSE         -> enabled only by the exact string 'true'
TARGET_HOST = os.environ.get('REDIRECT_PREFIX', '').rstrip('/')
URL_MATCH = os.environ.get('URL_MATCH')
URL_REPLACE = os.environ.get('URL_REPLACE')
VERBOSE = os.environ.get('VERBOSE', 'false') == 'true'

# Echo the effective settings to stderr so they show up in the container logs.
for _label, _value in (
    ("Target host: ", TARGET_HOST),
    ("URL match: ", URL_MATCH),
    ("URL replace: ", URL_REPLACE),
    ("Verbose: ", VERBOSE),
):
    print(_label, _value, file=sys.stderr)

# Response headers that must not be relayed verbatim: `requests` exposes an
# already-decoded body through `.content`, and Flask computes its own framing,
# so the upstream values would describe a payload the client never receives.
_UNRELAYED_RESPONSE_HEADERS = frozenset((
    'connection',
    'content-encoding',
    'content-length',
    'keep-alive',
    'transfer-encoding',
))

_BEARER_PREFIX = 'Bearer '


@app.route('/', defaults={'path': ''}, methods=['GET', 'POST'])
@app.route('/<path:path>', methods=['GET', 'POST'])
def index(path):
    """Forward the incoming request upstream and relay the upstream reply.

    The target URL comes from ``new_url(request)``. Request headers are copied
    as-is, except that ``Authorization`` is re-emitted as an ``api-key`` header
    (with a leading ``Bearer `` prefix removed) and ``Host`` is replaced with
    the upstream host.

    Args:
        path: Path captured by the route. Unused here; ``new_url`` reads the
            path from the request object directly.

    Returns:
        A Flask response carrying the upstream body, status code and headers
        (minus the headers listed in ``_UNRELAYED_RESPONSE_HEADERS``).
    """
    data = request.get_data()
    url = new_url(request)
    method = request.method

    headers = {}
    for k, v in request.headers:
        if k.lower() == 'authorization':
            # Strip only a leading "Bearer " so the credential itself is never
            # altered if it happens to contain that text.
            if v.startswith(_BEARER_PREFIX):
                v = v[len(_BEARER_PREFIX):]
            headers['api-key'] = v
        else:
            headers[k] = v
    headers['Host'] = urlparse(url).netloc

    if len(data) > 0:
        received = requests.request(method, url, headers=headers, data=data)
    else:
        received = requests.request(method, url, headers=headers)

    # Propagate the upstream status; without it every reply, including
    # upstream errors, would be reported to the client as 200.
    response = make_response(received.content, received.status_code)
    for k, v in received.headers.items():
        if k.lower() not in _UNRELAYED_RESPONSE_HEADERS:
            response.headers[k] = v

    if VERBOSE:
        print("Original headers:", request.headers, file=sys.stderr)
        print("Original body:", data, file=sys.stderr)
        print("New headers:", headers, file=sys.stderr)
        # Despite the label, this is the first 1000 bytes of the upstream
        # response body.
        print("New body:", received.content[:1000], file=sys.stderr)
    return response

def new_url(request):
    """Build the upstream URL for an incoming request.

    The request path, plus the ASCII-decoded query string when one is present,
    is appended to ``TARGET_HOST``. When both ``URL_MATCH`` and ``URL_REPLACE``
    are configured, the result is rewritten with ``re.sub``. In verbose mode
    the original and rewritten URLs are logged to stderr.

    Args:
        request: The Flask request being proxied.

    Returns:
        The URL string to send the proxied request to.
    """
    path_and_query = request.path
    if len(request.query_string) != 0:
        path_and_query = path_and_query + '?' + request.query_string.decode('ascii')

    rewritten = TARGET_HOST + path_and_query
    if not (URL_MATCH is None or URL_REPLACE is None):
        rewritten = re.sub(URL_MATCH, URL_REPLACE, rewritten)

    if VERBOSE:
        print("Original URL: %s\nNew URL: %s" % (path_and_query, rewritten), file=sys.stderr)
    return rewritten

if __name__ == '__main__':
    # Started as a script: serve on all interfaces, port 5000.
    app.run(port=5000, host='0.0.0.0')
3 changes: 3 additions & 0 deletions aiproxy/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Build the aiproxy image and tag it `aiproxy`.
# Plain POSIX sh is enough here; nothing in this script needs zsh, which is
# not installed on many Linux hosts and CI images.
set -eu

# Use the script's own directory as the build context so the build does not
# depend on the caller's working directory.
cd "$(dirname "$0")"

docker build . --tag aiproxy
2 changes: 2 additions & 0 deletions aiproxy/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
requests
flask
17 changes: 11 additions & 6 deletions airflow/dags/ingestion/ask-astro-load.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,12 @@ def check_seed_baseline(seed_baseline_url: str = None) -> str | set:
else:
return {
"extract_github_markdown",
"extract_airflow_docs",
#XXX changed manually
#"extract_airflow_docs",
"extract_stack_overflow",
"extract_astro_registry_cell_types",
"extract_github_issues",
#XXX changed manually
#"extract_github_issues",
"extract_astro_blogs",
"extract_astro_registry_dags",
"extract_astro_cli_docs",
Expand Down Expand Up @@ -380,13 +382,15 @@ def import_baseline(
)

md_docs = extract_github_markdown.expand(source=markdown_docs_sources)
issues_docs = extract_github_issues.expand(repo_base=issues_docs_sources)
#XXX changed manually
#issues_docs = extract_github_issues.expand(repo_base=issues_docs_sources)
stackoverflow_docs = extract_stack_overflow.expand(tag=stackoverflow_tags)
registry_cells_docs = extract_astro_registry_cell_types()
blogs_docs = extract_astro_blogs()
registry_dags_docs = extract_astro_registry_dags()
_astro_docs = extract_astronomer_docs()
_airflow_docs = extract_airflow_docs()
#XXX changed manually
#_airflow_docs = extract_airflow_docs()
_astro_cli_docs = extract_astro_cli_docs()
_extract_astro_providers_docs = extract_astro_provider_doc()
_astro_forum_docs = extract_astro_forum_doc()
Expand All @@ -398,14 +402,15 @@ def import_baseline(

markdown_tasks = [
md_docs,
issues_docs,
#issues_docs,
stackoverflow_docs,
blogs_docs,
registry_cells_docs,
]

html_tasks = [
_airflow_docs,
#XXX changed manually
#_airflow_docs,
_astro_cli_docs,
_extract_astro_providers_docs,
_astro_forum_docs,
Expand Down
46 changes: 29 additions & 17 deletions airflow/docker-compose.override.yml
Original file line number Diff line number Diff line change
@@ -1,24 +1,36 @@
version: '3.1'
networks:
ask-astro_astro_network:
external: true
services:
webserver:
ports:
- 8501:8501
networks:
- airflow
weaviate:
image: semitechnologies/weaviate:1.21.0
command: "--host 0.0.0.0 --port '8081' --scheme http"
volumes:
- ${PWD}/include/weaviate/backup:/var/lib/weaviate/backup
environment:
QUERY_DEFAULTS_LIMIT: 25
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
DEFAULT_VECTORIZER_MODULE: 'text2vec-openai'
ENABLE_MODULES: 'text2vec-openai, backup-filesystem, qna-openai, generative-openai, text2vec-cohere, reranker-cohere'
BACKUP_FILESYSTEM_PATH: '/var/lib/weaviate/backup'
CLUSTER_HOSTNAME: 'node1'
ports:
- 8081:8081
- ask-astro_astro_network
triggerer:
networks:
- ask-astro_astro_network
scheduler:
networks:
- ask-astro_astro_network
postgres:
networks:
- airflow
- ask-astro_astro_network
# weaviate:
# image: semitechnologies/weaviate:1.21.0
# command: "--host 0.0.0.0 --port '8081' --scheme http"
# volumes:
# - ${PWD}/include/weaviate/backup:/var/lib/weaviate/backup
# environment:
# QUERY_DEFAULTS_LIMIT: 25
# AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
# PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
# DEFAULT_VECTORIZER_MODULE: 'text2vec-openai'
# ENABLE_MODULES: 'text2vec-openai, backup-filesystem, qna-openai, generative-openai, text2vec-cohere, reranker-cohere'
# BACKUP_FILESYSTEM_PATH: '/var/lib/weaviate/backup'
# CLUSTER_HOSTNAME: 'node1'
# ports:
# - 8081:8081
# networks:
# - airflow
Empty file modified api/build.sh
100644 → 100755
Empty file.
24 changes: 22 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
---
version: '3.4'
networks:
astro_network:
driver: bridge
services:
astro-api:
networks:
- astro_network
image: ask-astro-1.0.0-dev
ports:
- 8080:8080
- 8085:8080
environment:
FIRESTORE_INSTALLATION_STORE_COLLECTION: fb-astro-install
FIRESTORE_STATE_STORE_COLLECTION: fb-astro-state
Expand All @@ -27,7 +32,8 @@ services:
LANGCHAIN_ENDPOINT: nothing
LANGCHAIN_API_KEY: nothing
OPENAI_API_KEY: <OPENAI_API_KEY>
WEAVIATE_URL: http://weaviate:8080
OPENAI_BASE_URL: http://aiproxy:5000
# Traffic between services on astro_network goes to the container port (8080);
# published host ports are not reachable through the service name.
WEAVIATE_URL: http://weaviate:8080
WEAVIATE_API_KEY: test
WEAVIATE_INDEX_NAME: Weaviateidx
WEAVIATE_TEXT_KEY: textkey
Expand All @@ -36,6 +42,8 @@ services:
MULTI_QUERY_RETRIEVER_DEPLOYMENT_NAME: gpt-4
WEAVIATE_CREATE_SCHEMA_IF_MISSING: true
weaviate:
networks:
- astro_network
image: semitechnologies/weaviate:1.23.4
ports:
- 8088:8080
Expand All @@ -50,6 +58,18 @@ services:
DEFAULT_VECTORIZER_MODULE: 'none'
ENABLE_MODULES: 'text2vec-cohere,text2vec-huggingface,text2vec-palm,text2vec-openai,generative-openai,generative-cohere,generative-palm,ref2vec-centroid,reranker-cohere,qna-openai'
CLUSTER_HOSTNAME: 'node1'
aiproxy:
  # Local proxy image (tagged `aiproxy` by aiproxy/build.sh). astro-api reaches
  # it on the shared network via OPENAI_BASE_URL (http://aiproxy:5000).
  networks:
    - astro_network
  image: aiproxy
  ports:
    - '5001:5000'
  environment:
    # Base URL that proxied requests are forwarded to.
    REDIRECT_PREFIX: 'https://tob-internal-chatgpt.openai.azure.com'
    # Alternative rewrite, kept for reference (disabled):
    # URL_MATCH: 2022-..-..
    # URL_REPLACE: 2023-07-01-preview
    # Regex/replacement pair the proxy applies to the forwarded URL.
    URL_MATCH: '/v1/embeddings'
    URL_REPLACE: '/openai/deployments/text-embedding-ada-002/embeddings?api-version=2023-07-01-preview'
volumes:
weaviate_data:
...

0 comments on commit ba48fcf

Please sign in to comment.