Skip to content

Commit

Permalink
Merge branch 'main' into mog/self-host-proxy
Browse files Browse the repository at this point in the history
  • Loading branch information
mogery committed Feb 20, 2025
2 parents fbb57cf + dc15015 commit 09d62fa
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 20 deletions.
57 changes: 42 additions & 15 deletions .github/workflows/test-server-self-host.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ jobs:
name: Run tests
strategy:
matrix:
openai: [true, false]
serper: [true, false]
playwright: [true, false]
proxy: [true, false]
ai: ["openai", "no-ai"]
search: ["searxng", "google"]
engine: ["playwright", "fetch"]
proxy: ["proxy", "no-proxy"]
fail-fast: false
runs-on: ubuntu-latest
services:
Expand All @@ -33,12 +33,12 @@ jobs:
ports:
- 6379:6379
env:
OPENAI_API_KEY: ${{ matrix.openai == true && secrets.OPENAI_API_KEY || '' }}
SERPER_API_KEY: ${{ matrix.serper == true && secrets.SERPER_API_KEY || '' }}
PLAYWRIGHT_MICROSERVICE_URL: ${{ matrix.playwright == true && 'http://localhost:3003/scrape' || '' }}
PROXY_SERVER: ${{ matrix.proxy == true && secrets.PROXY_SERVER || '' }}
PROXY_USERNAME: ${{ matrix.proxy == true && secrets.PROXY_SERVER || '' }}
PROXY_PASSWORD: ${{ matrix.proxy == true && secrets.PROXY_PASSWORD || '' }}
OPENAI_API_KEY: ${{ matrix.ai == 'openai' && secrets.OPENAI_API_KEY || '' }}
SEARXNG_ENDPOINT: ${{ matrix.search == 'searxng' && 'http://localhost:3434' || '' }}
PLAYWRIGHT_MICROSERVICE_URL: ${{ matrix.engine == 'playwright' && 'http://localhost:3003/scrape' || '' }}
PROXY_SERVER: ${{ matrix.proxy == 'proxy' && secrets.PROXY_SERVER || '' }}
PROXY_USERNAME: ${{ matrix.proxy == 'proxy' && secrets.PROXY_USERNAME || '' }}
PROXY_PASSWORD: ${{ matrix.proxy == 'proxy' && secrets.PROXY_PASSWORD || '' }}
steps:
- uses: actions/checkout@v3
- name: Install pnpm
Expand All @@ -55,7 +55,7 @@ jobs:
run: pnpm install
working-directory: ./apps/api
- name: Install Playwright dependencies
if: matrix.playwright == true
if: matrix.engine == 'playwright'
run: |
pnpm install
pnpm exec playwright install-deps
Expand All @@ -72,22 +72,36 @@ jobs:
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go
chmod +x html-to-markdown.so
working-directory: ./apps/api/sharedLibs/go-html-to-md
- name: Set up SearXNG
if: matrix.search == 'searxng'
run: |
mkdir searxng
echo "use_default_settings: true
search:
formats: [html, json, csv]
server:
secret_key: 'fcsecret'" > searxng/settings.yml
docker run -d -p 3434:8080 -v "${PWD}/searxng:/etc/searxng" --name searxng searxng/searxng
pnpx wait-on tcp:3434 -t 30s
working-directory: ./
- name: Start server
run: npm start > api.log 2>&1 &
working-directory: ./apps/api
- name: Start worker
run: npm run workers > worker.log 2>&1 &
working-directory: ./apps/api
- name: Start playwright
if: matrix.playwright == true
if: matrix.engine == 'playwright'
run: npm run dev > playwright.log 2>&1 &
working-directory: ./apps/playwright-service-ts
env:
PORT: 3003
- name: Wait for server
run: pnpx wait-on tcp:3002 -t 15s
- name: Wait for playwright
if: matrix.playwright == true
if: matrix.engine == 'playwright'
run: pnpx wait-on tcp:3003 -t 15s
- name: Run snippet tests
run: |
Expand All @@ -96,16 +110,29 @@ jobs:
- name: Kill instances
if: always()
run: pkill -9 node
- name: Kill SearXNG
if: always() && matrix.search == 'searxng'
run: |
docker logs searxng > searxng/searxng.log 2>&1
docker kill searxng
working-directory: ./
- uses: actions/upload-artifact@v4
if: always()
with:
name: Logs (openai ${{ matrix.openai }}, serper ${{ matrix.serper }}, playwright ${{ matrix.playwright }}}, proxy ${{ matrix.proxy }})
name: Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.engine }}, ${{ matrix.proxy }})
path: |
./apps/api/api.log
./apps/api/worker.log
- uses: actions/upload-artifact@v4
  # BUG FIX: the matrix key was renamed from `playwright: [true, false]` to
  # `engine: ["playwright", "fetch"]` in this same change, so
  # `matrix.playwright` no longer exists and the old condition
  # (`always() && matrix.playwright`) was always falsy — the Playwright log
  # artifact was never uploaded. Compare with the other steps gated on
  # `matrix.engine == 'playwright'`.
  if: always() && matrix.engine == 'playwright'
  with:
    name: Playwright Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.proxy }})
    path: |
      ./apps/playwright-service-ts/playwright.log
- uses: actions/upload-artifact@v4
if: always() && matrix.search == 'searxng'
with:
name: SearXNG (${{ matrix.ai }}, ${{ matrix.engine }}, ${{ matrix.proxy }})
path: |
./searxng/searxng.log
./searxng/settings.yml
20 changes: 16 additions & 4 deletions SELF_HOST.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,28 @@ USE_DB_AUTHENTICATION=false
# ===== Optional ENVS ======
# Proxy configuration (this will be used by both fetch and playwright engines)
## === AI features (JSON format on scrape, /extract API) ===
# Provide your OpenAI API key here to enable AI features
# OPENAI_API_KEY=
## === Proxy ===
# PROXY_SERVER can be a full URL (e.g. http://0.1.2.3:1234) or just an IP and port combo (e.g. 0.1.2.3:1234)
# Do not uncomment PROXY_USERNAME and PROXY_PASSWORD if your proxy is unauthenticated
# PROXY_SERVER=
# PROXY_USERNAME=
# PROXY_PASSWORD=
## === /search API ===
# By default, the /search API will use Google search.
# You can specify a SearXNG server with the JSON format enabled, if you'd like to use that instead of direct Google.
# You can also customize the engines and categories parameters, but the defaults should also work just fine.
# SEARXNG_ENDPOINT=http://your.searxng.server
# SEARXNG_ENGINES=
# SEARXNG_CATEGORIES=
## === Other ===
# Supabase Setup (used to support DB authentication, advanced logging, etc.)
# SUPABASE_ANON_TOKEN=
# SUPABASE_URL=
Expand All @@ -65,9 +80,6 @@ USE_DB_AUTHENTICATION=false
# You can add this to enable ScrapingBee as a fallback scraping engine.
# SCRAPING_BEE_API_KEY=
# Needed for JSON format on scrape and /extract endpoint
# OPENAI_API_KEY=
# This key lets you access the queue admin panel. Change this if your deployment is publicly accessible.
BULL_AUTH_KEY=CHANGEME
Expand Down
2 changes: 1 addition & 1 deletion apps/api/src/__tests__/snips/scrape.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ describe("Scrape tests", () => {
url: "https://icanhazip.com"
});

expect(response.markdown?.trim() === process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]).toBe(true);
expect(response.markdown?.trim()).toBe(process.env.PROXY_SERVER!.split("://").slice(-1)[0].split(":")[0]);
});
}

Expand Down
11 changes: 11 additions & 0 deletions apps/api/src/search/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { googleSearch } from "./googlesearch";
import { fireEngineMap } from "./fireEngine";
import { searchapi_search } from "./searchapi";
import { serper_search } from "./serper";
import { searxng_search } from "./searxng";

export async function search({
query,
Expand Down Expand Up @@ -51,6 +52,16 @@ export async function search({
location,
});
}
if (process.env.SEARXNG_ENDPOINT) {
return await searxng_search(query, {
num_results,
tbs,
filter,
lang,
country,
location,
});
}
return await googleSearch(
query,
advanced,
Expand Down
64 changes: 64 additions & 0 deletions apps/api/src/search/searxng.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";
import { logger } from "../lib/logger"

dotenv.config();

// Options for a SearXNG query. Mirrors the option set the other search
// providers in this package accept; not every field can be forwarded to
// SearXNG — see the per-field notes and the comments in searxng_search.
interface SearchOptions {
  tbs?: string;          // time-based search filter — ignored (no SearXNG equivalent)
  filter?: string;       // result filter — ignored (no SearXNG equivalent)
  lang?: string;         // forwarded as SearXNG's `language` request param
  country?: string;      // ignored — SearXNG has no country/`gl` param
  location?: string;     // ignored — SearXNG has no location param
  num_results: number;   // requested result count; SearXNG has no per-request limit param
  page?: number;         // forwarded as SearXNG's `pageno` param (defaults to 1)
}

/**
 * Searches via a SearXNG instance's `/search` endpoint (JSON format) and
 * maps the response onto the generic `SearchResult` shape.
 *
 * The endpoint is read from `SEARXNG_ENDPOINT`; `SEARXNG_ENGINES` and
 * `SEARXNG_CATEGORIES` optionally customize which engines/categories the
 * instance queries. Best-effort: any request or parse failure is logged
 * and an empty array is returned rather than thrown.
 *
 * @param q       the search query string
 * @param options generic search options; only `lang`, `page`, and
 *                `num_results` are honored — `country`, `location`, `tbs`,
 *                and `filter` have no SearXNG equivalent and are ignored
 * @returns at most `options.num_results` results (url/title/description)
 */
export async function searxng_search(
  q: string,
  options: SearchOptions,
): Promise<SearchResult[]> {
  const params = {
    q: q,
    language: options.lang,
    // gl: options.country, //not possible with SearXNG
    // location: options.location, //not possible with SearXNG
    // num: options.num_results, //not possible as a SearXNG request param; enforced client-side below
    engines: process.env.SEARXNG_ENGINES || "",
    categories: process.env.SEARXNG_CATEGORIES || "general",
    pageno: options.page ?? 1,
    format: "json"
  };

  // Normalize the endpoint: drop any trailing slash(es), then append "/search".
  const url = process.env.SEARXNG_ENDPOINT!;
  const finalUrl = url.replace(/\/+$/, "") + "/search";

  try {
    const response = await axios.get(finalUrl, {
      headers: {
        "Content-Type": "application/json",
      },
      params: params,
    });

    const data = response.data;

    if (data && Array.isArray(data.results)) {
      // BUG FIX: the original ignored the required `num_results` option
      // entirely and returned a full SearXNG result page. Since the API
      // offers no per-request limit param, truncate client-side so callers
      // get at most the number of results they asked for.
      return data.results
        .slice(0, options.num_results)
        .map((a: any) => ({
          url: a.url,
          title: a.title,
          description: a.content,
        }));
    } else {
      return [];
    }
  } catch (error) {
    logger.error(`There was an error searching for content`, { error });
    return [];
  }
}

0 comments on commit 09d62fa

Please sign in to comment.