Skip to content

Commit

Permalink
feat(self-host): proxy support (FIR-1111) (#1212)
Browse files Browse the repository at this point in the history
* feat(self-host): proxy support

* fix(playwright-service-ts): return untreated text/plain
  • Loading branch information
mogery authored Feb 20, 2025
1 parent c75522f commit c38dcd0
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 11 deletions.
10 changes: 7 additions & 3 deletions .github/workflows/test-server-self-host.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ jobs:
ai: ["openai", "no-ai"]
search: ["searxng", "google"]
engine: ["playwright", "fetch"]
proxy: ["proxy", "no-proxy"]
fail-fast: false
runs-on: ubuntu-latest
services:
Expand All @@ -35,6 +36,9 @@ jobs:
OPENAI_API_KEY: ${{ matrix.ai == 'openai' && secrets.OPENAI_API_KEY || '' }}
SEARXNG_ENDPOINT: ${{ matrix.search == 'searxng' && 'http://localhost:3434' || '' }}
PLAYWRIGHT_MICROSERVICE_URL: ${{ matrix.engine == 'playwright' && 'http://localhost:3003/scrape' || '' }}
PROXY_SERVER: ${{ matrix.proxy == 'proxy' && secrets.PROXY_SERVER || '' }}
PROXY_USERNAME: ${{ matrix.proxy == 'proxy' && secrets.PROXY_USERNAME || '' }}
PROXY_PASSWORD: ${{ matrix.proxy == 'proxy' && secrets.PROXY_PASSWORD || '' }}
steps:
- uses: actions/checkout@v3
- name: Install pnpm
Expand Down Expand Up @@ -115,20 +119,20 @@ jobs:
- uses: actions/upload-artifact@v4
if: always()
with:
name: Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.engine }})
name: Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.engine }}, ${{ matrix.proxy }})
path: |
./apps/api/api.log
./apps/api/worker.log
- uses: actions/upload-artifact@v4
# Upload Playwright logs only for matrix entries that run the playwright engine.
# The matrix defines `engine` (not `playwright`), so the condition must test matrix.engine.
if: always() && matrix.engine == 'playwright'
with:
name: Playwright Logs (${{ matrix.ai }}, ${{ matrix.search }})
name: Playwright Logs (${{ matrix.ai }}, ${{ matrix.search }}, ${{ matrix.proxy }})
path: |
./apps/playwright-service-ts/playwright.log
- uses: actions/upload-artifact@v4
if: always() && matrix.search == 'searxng'
with:
name: SearXNG (${{ matrix.ai }}, ${{ matrix.engine }})
name: SearXNG (${{ matrix.ai }}, ${{ matrix.engine }}, ${{ matrix.proxy }})
path: |
./searxng/searxng.log
./searxng/settings.yml
7 changes: 7 additions & 0 deletions SELF_HOST.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,13 @@ USE_DB_AUTHENTICATION=false
# Provide your OpenAI API key here to enable AI features
# OPENAI_API_KEY=
## === Proxy ===
# PROXY_SERVER can be a full URL (e.g. http://0.1.2.3:1234) or just an IP and port combo (e.g. 0.1.2.3:1234), in which case http:// is assumed
# Do not uncomment PROXY_USERNAME and PROXY_PASSWORD if your proxy is unauthenticated
# PROXY_SERVER=
# PROXY_USERNAME=
# PROXY_PASSWORD=
## === /search API ===
# By default, the /search API will use Google search.
Expand Down
10 changes: 10 additions & 0 deletions apps/api/src/__tests__/snips/scrape.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ describe("Scrape tests", () => {
expect(response.markdown).toContain("Firecrawl");
}, 10000);

// Registered only for self-hosted test runs that have PROXY_SERVER configured.
if (process.env.TEST_SUITE_SELF_HOSTED && process.env.PROXY_SERVER) {
  it.concurrent("self-hosted proxy works", async () => {
    const { markdown } = await scrape({ url: "https://icanhazip.com" });

    // Reduce PROXY_SERVER to its host part: drop an optional "scheme://"
    // prefix, then drop everything from the first ":" (the port) onwards.
    const withoutScheme = process.env.PROXY_SERVER!.split("://").slice(-1)[0];
    const proxyHost = withoutScheme.split(":")[0];

    // The scraped page body is expected to be exactly the proxy's host,
    // i.e. the request must have gone out through the proxy.
    expect(markdown?.trim()).toBe(proxyHost);
  });
}

if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.PLAYWRIGHT_MICROSERVICE_URL) {
it.concurrent("waitFor works", async () => {
const response = await scrape({
Expand Down
14 changes: 12 additions & 2 deletions apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,24 @@ export function makeSecureDispatcher(
url: string,
options?: undici.Agent.Options,
) {
const agent = new undici.Agent({
const agentOpts: undici.Agent.Options = {
connect: {
rejectUnauthorized: false, // bypass SSL failures -- this is fine
// lookup: secureLookup,
},
maxRedirections: 5000,
...options,
});
};

const agent = process.env.PROXY_SERVER
? new undici.ProxyAgent({
uri: process.env.PROXY_SERVER.includes("://") ? process.env.PROXY_SERVER : ("http://" + process.env.PROXY_SERVER),
token: process.env.PROXY_USERNAME
? `Basic ${Buffer.from(process.env.PROXY_USERNAME + ":" + (process.env.PROXY_PASSWORD ?? "")).toString("base64")}`
: undefined,
...agentOpts,
})
: new undici.Agent(agentOpts);

agent.on("connect", (_, targets) => {
const client: undici.Client = targets.slice(-1)[0] as undici.Client;
Expand Down
2 changes: 1 addition & 1 deletion apps/playwright-service-ts/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ const scrapePage = async (page: Page, url: string, waitUntil: 'load' | 'networki
if (response) {
headers = await response.allHeaders();
const ct = Object.entries(headers).find(x => x[0].toLowerCase() === "content-type");
if (ct && ct[1].includes("application/json")) {
if (ct && (ct[1].includes("application/json") || ct[1].includes("text/plain"))) {
content = (await response.body()).toString("utf8"); // TODO: determine real encoding
}
}
Expand Down
16 changes: 11 additions & 5 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ services:
playwright-service:
build: apps/playwright-service-ts
environment:
- PORT=3000
- PROXY_SERVER=${PROXY_SERVER}
- PROXY_USERNAME=${PROXY_USERNAME}
- PROXY_PASSWORD=${PROXY_PASSWORD}
- BLOCK_MEDIA=${BLOCK_MEDIA}
PORT: 3000
PROXY_SERVER: ${PROXY_SERVER}
PROXY_USERNAME: ${PROXY_USERNAME}
PROXY_PASSWORD: ${PROXY_PASSWORD}
BLOCK_MEDIA: ${BLOCK_MEDIA}
networks:
- backend

Expand Down Expand Up @@ -51,6 +51,9 @@ services:
SERPER_API_KEY: ${SERPER_API_KEY}
SEARCHAPI_API_KEY: ${SEARCHAPI_API_KEY}
LOGGING_LEVEL: ${LOGGING_LEVEL}
PROXY_SERVER: ${PROXY_SERVER}
PROXY_USERNAME: ${PROXY_USERNAME}
PROXY_PASSWORD: ${PROXY_PASSWORD}
FLY_PROCESS_GROUP: app
depends_on:
- redis
Expand Down Expand Up @@ -85,6 +88,9 @@ services:
HOST: ${HOST:-0.0.0.0}
SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL}
LOGGING_LEVEL: ${LOGGING_LEVEL}
PROXY_SERVER: ${PROXY_SERVER}
PROXY_USERNAME: ${PROXY_USERNAME}
PROXY_PASSWORD: ${PROXY_PASSWORD}
FLY_PROCESS_GROUP: worker
depends_on:
- redis
Expand Down

0 comments on commit c38dcd0

Please sign in to comment.