Skip to content

VictoriaTraces Jaeger API Bug - Only spans ending within ~30-60 seconds of trace start are returned. #79

@nikola-etched

Description

@nikola-etched

Describe the bug

VictoriaTraces Jaeger API Bug

Problem

Jaeger API /select/jaeger/api/traces/{traceID} returns incomplete traces. Only spans ending within ~30-60 seconds of trace start are returned.

Evidence

Production trace 4e4a1b65edb8143ce2fa90ee1df9601b:

  • LogsQL: 10 spans
  • Jaeger API: 3 spans

Test trace (see attached script):

  • LogsQL: 5 spans
  • Jaeger API: 3 spans

Reproduce

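Run the attached reproduce_victoria_traces_bug.py (takes about 65 seconds: three 20-second waits between spans plus a 5-second ingestion wait; the generated trace itself spans 90 seconds):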

pip install requests
python3 reproduce_victoria_traces_bug.py

Result: LogsQL returns all 5 spans, Jaeger API returns only 3.

Impact

Any trace over 1 minute appears incomplete in Grafana. Affects all long-running operations.

Environment

VictoriaTraces v0.4.0

Request

Please fix the Jaeger API so that it returns all spans for a trace_id, not just those ending within the first 60 seconds.

To Reproduce


#!/usr/bin/env python3
"""
VictoriaTraces Jaeger API Bug - Standalone Reproduction

Bug: Jaeger API only returns spans ending within ~30-60 seconds of trace start.
Runtime: ~65 seconds (the generated trace covers 90 seconds of span time)
Dependencies: requests (pip install requests)

Usage:
    python3 reproduce_victoria_traces_bug.py

Expected: LogsQL and Jaeger API both return all 5 spans
Actual:   LogsQL returns 5 spans, Jaeger API returns only 3 spans
"""

import json
import random
import time
import requests


# Configuration - CHANGE THIS to your VictoriaTraces endpoint
# Base URL only, without a trailing slash: request paths such as
# "/insert/opentelemetry/v1/traces" are appended to it directly.
VICTORIA_TRACES_URL = "https://traces-dev.my-company.com"
# Passed as `verify=` to every requests call made by this script.
VERIFY_SSL = False  # Set to True if you have valid SSL certs


def generate_id(length=16):
    """Return a random lowercase hexadecimal string of ``length`` characters.

    The script uses 32 characters for trace IDs and 16 for span IDs.
    """
    hex_digits = '0123456789abcdef'
    picked = []
    for _ in range(length):
        picked.append(random.choice(hex_digits))
    return ''.join(picked)


def send_span_otlp(trace_id, span_id, parent_span_id, name, start_time_ns, duration_ns):
    """Post one span to the VictoriaTraces OTLP/HTTP JSON ingestion endpoint.

    The span is wrapped in a minimal ``resourceSpans`` envelope tagged with
    the ``bug-repro`` service name. ``parentSpanId`` is only included when
    ``parent_span_id`` is truthy, so passing ``None`` produces a root span.
    The end timestamp is ``start_time_ns + duration_ns``.

    Returns:
        True if the POST completed without raising (including the HTTP
        status check); False otherwise, after printing the error.
    """
    finish_ns = start_time_ns + duration_ns

    otlp_span = {
        "traceId": trace_id,
        "spanId": span_id,
        "name": name,
        "startTimeUnixNano": str(start_time_ns),
        "endTimeUnixNano": str(finish_ns),
        "kind": 1,
        "attributes": [
            {"key": "service.name", "value": {"stringValue": "bug-repro"}},
        ],
        "status": {"code": 0},
    }
    if parent_span_id:
        otlp_span["parentSpanId"] = parent_span_id

    resource = {
        "attributes": [
            {"key": "service.name", "value": {"stringValue": "bug-repro"}},
        ],
    }
    envelope = {
        "resourceSpans": [
            {"resource": resource, "scopeSpans": [{"spans": [otlp_span]}]},
        ],
    }

    try:
        reply = requests.post(
            f"{VICTORIA_TRACES_URL}/insert/opentelemetry/v1/traces",
            json=envelope,
            verify=VERIFY_SSL,
            timeout=5,
        )
        reply.raise_for_status()
    except Exception as err:
        # Best-effort: report the failure and let the caller decide what to do.
        print(f"  ERROR sending span {name}: {err}")
        return False
    return True


def _query_logsql_span_names(trace_id):
    """Return the set of span names LogsQL reports for ``trace_id``.

    The response body is read as one JSON object per line. Lines that are
    blank, not valid JSON, or not JSON objects are skipped; objects without a
    truthy ``span_id`` are ignored.

    Raises:
        requests.RequestException: on transport failures or a 4xx/5xx status.
    """
    response = requests.get(
        f"{VICTORIA_TRACES_URL}/select/logsql/query",
        params={"query": f'"trace_id":"{trace_id}"', "limit": 100},
        verify=VERIFY_SSL,
        timeout=10,
    )
    # Without this an error page would be parsed as "0 spans".
    response.raise_for_status()

    names = set()
    # Split on '\n' only: str.splitlines() would also break on characters
    # such as U+2028 that may legally appear unescaped inside JSON strings.
    for line in response.text.split('\n'):
        if not line.strip():
            continue
        try:
            entry = json.loads(line)
        except ValueError:
            # Not a JSON line; skip it instead of aborting the whole query.
            continue
        if isinstance(entry, dict) and entry.get('span_id'):
            names.add(entry.get('name', 'unknown'))
    return names


def _query_jaeger_span_names(trace_id):
    """Return the operation names of the spans the Jaeger API returns.

    Only the first trace object in the response's ``data`` list is inspected.
    A missing, null or empty ``data`` list yields an empty result instead of
    an IndexError.

    Raises:
        requests.RequestException: on transport failures or a 4xx/5xx status.
        ValueError: if the response body is not valid JSON.
    """
    response = requests.get(
        f"{VICTORIA_TRACES_URL}/select/jaeger/api/traces/{trace_id}",
        verify=VERIFY_SSL,
        timeout=10,
    )
    response.raise_for_status()

    traces = response.json().get('data') or []
    if not traces:
        return []
    spans = traces[0].get('spans') or []
    return [span['operationName'] for span in spans]


def main():
    """Send a 5-span test trace, query it through both APIs, and print a verdict.

    Four 100ms child spans are sent 20 seconds apart (start offsets 0s, 20s,
    40s, 60s), followed by a parent span covering 0s-90s. After a 5-second
    ingestion wait, the trace is fetched via LogsQL and via the Jaeger API and
    the two span counts are compared against the expected total of 5.
    """
    child_count = 4
    expected_total = child_count + 1  # children plus the parent span
    interval_s = 20

    print("=" * 80)
    print("VictoriaTraces Jaeger API Bug Reproduction")
    print("=" * 80)
    print()

    # Generate trace and span IDs
    trace_id = generate_id(32)
    parent_id = generate_id(16)
    child_ids = [generate_id(16) for _ in range(child_count)]

    print(f"Trace ID: {trace_id}")
    print()
    print("Creating trace with 5 spans over 90 seconds...")
    print("  - 1 parent span (0s to 90s)")
    print("  - 4 child spans at 20s intervals")
    print()

    base_time = int(time.time() * 1_000_000_000)  # Current time in nanoseconds

    # Count successful sends so the summary below does not claim success
    # when the endpoint rejected or never received a span.
    sent_ok = 0

    # Send child spans at 20-second intervals
    for i, child_id in enumerate(child_ids):
        elapsed = i * interval_s
        print(f"[{elapsed:3d}s] Sending child_span_{i+1}...")

        delivered = send_span_otlp(
            trace_id, child_id, parent_id,
            f"child_span_{i+1}",
            base_time + elapsed * 1_000_000_000,
            100_000_000,  # 100ms duration
        )
        if delivered:
            sent_ok += 1

        if i < child_count - 1:  # Don't wait after last child
            time.sleep(interval_s)

    # Send parent span (covers entire trace).
    # NOTE: this happens roughly 60s after base_time (three 20s sleeps), so
    # the parent's end timestamp (base_time + 90s) is still in the future
    # when it is sent.
    print("[ 90s] Sending parent_span (covers full 90s)...")
    delivered = send_span_otlp(
        trace_id, parent_id, None,
        "parent_span",
        base_time,
        90_000_000_000,  # 90 seconds
    )
    if delivered:
        sent_ok += 1

    print()
    if sent_ok == expected_total:
        print("✓ All 5 spans sent to VictoriaTraces")
    else:
        print(f"⚠ Only {sent_ok} of {expected_total} spans were sent successfully;"
              " the results below may be incomplete")
    print()

    # Wait for ingestion
    print("Waiting 5 seconds for data ingestion...")
    time.sleep(5)
    print()

    # Query LogsQL
    print("=" * 80)
    print("Querying VictoriaTraces...")
    print("=" * 80)
    print()

    print("[1] LogsQL Query (storage backend):")
    try:
        span_names = _query_logsql_span_names(trace_id)
        print(f"    Returned: {len(span_names)} spans")
        for name in sorted(span_names):
            print(f"      - {name}")
    except Exception as e:
        # Script-level boundary: report the failure and still print a verdict.
        print(f"    ERROR: {e}")
        span_names = set()

    print()

    # Query Jaeger API
    print("[2] Jaeger API Query:")
    try:
        jaeger_names = _query_jaeger_span_names(trace_id)
        print(f"    Returned: {len(jaeger_names)} spans")
        for name in sorted(jaeger_names):
            print(f"      - {name}")
    except Exception as e:
        # Script-level boundary: report the failure and still print a verdict.
        print(f"    ERROR: {e}")
        jaeger_names = []

    print()

    # Results
    print("=" * 80)
    print("RESULTS")
    print("=" * 80)
    print()

    logsql_count = len(span_names)
    jaeger_count = len(jaeger_names)

    print(f"Expected spans:  {expected_total}")
    print(f"LogsQL returned: {logsql_count} spans")
    print(f"Jaeger API returned: {jaeger_count} spans")
    print()

    if logsql_count == jaeger_count == expected_total:
        print("✓ SUCCESS: Both APIs returned all spans - no bug detected")
        print()
        print("This could mean:")
        print("  - The bug has been fixed")
        print("  - Test duration too short (try longer trace)")
        print("  - Different VictoriaTraces version")

    elif logsql_count == expected_total and jaeger_count < expected_total:
        print(f"✗ BUG CONFIRMED: Jaeger API missing {expected_total - jaeger_count} spans!")
        print()

        missing = set(span_names) - set(jaeger_names)
        print("Missing from Jaeger API:")
        for name in sorted(missing):
            print(f"  - {name}")

        print()
        print("DIAGNOSIS:")
        print("  VictoriaTraces Jaeger API has ~30 second time window")
        print("  Spans ending after ~30s from trace start are filtered out")
        print()
        print("Expected: child_span_3, child_span_4, parent_span are missing")
        # Children start at 40s and 60s and last 100ms; the parent ends at 90s.
        print("  (They end at ~40s, ~60s, and 90s respectively)")

    else:
        print(f"⚠ UNEXPECTED: LogsQL={logsql_count}, Jaeger={jaeger_count}")
        print("  Data may not have been ingested properly")

    print()
    print("View in VictoriaTraces UI:")
    print(f"  {VICTORIA_TRACES_URL}/select/vmui/?#/?query=%22trace_id%22%3A%22{trace_id}%22")
    print()


if __name__ == "__main__":
    # Silence urllib3's InsecureRequestWarning so that running with
    # certificate verification turned off does not clutter the output.
    import urllib3
    from urllib3.exceptions import InsecureRequestWarning

    urllib3.disable_warnings(category=InsecureRequestWarning)

    main()

Version

Version victoria-traces-20251014-032256-tags-v0.4.0-0-g8a5f1b618

Logs

The Python script above reproduces the issue on the version listed above.

I do not have access to the logs on our server.

Screenshots

No response

Used command-line flags

No response

Additional information

No response

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions