Why aren't my Go http services' otel traces being linked from client to server despite sharing a trace_id

83 Views Asked by At

I'm trying to figure out why my traces aren't being linked, despite having the same trace_id between my http client and server.

Client:

enter image description here

Server:

enter image description here

Why is this a missing span?!?!

On the client side I'm injecting the tracing information, and the trace_id does match between the services. Here's the client request code:

// triggerPlaywrightTests POSTs a JSON TriggerPlaywrightTestsRequest describing
// this host (environment, customer, short lowercase hostname, local IP) to
// triggerPlaywrightTestsUrl.
//
// The "environment" and "customer" values are read from ctx via
// ValueFromContextKey. The request is sent through an otelhttp transport with
// a context derived from the span created here, and is bounded by a 30 second
// timeout.
//
// It returns an error if any of the four identity values is missing, if the
// request cannot be built or sent, or if the server answers with a non-2xx
// status.
func triggerPlaywrightTests(ctx context.Context) (err error) {
    subCtx, subSpan := SpanCreator(ctx, "", "triggerPlaywrightTests")
    defer subSpan.End()

    rawhost, err := os.Hostname()
    if err != nil {
        return fmt.Errorf("looking up hostname: %w", err)
    }
    // Keep only the first DNS label, e.g. "web01.example.com" -> "web01".
    host := strings.ToLower(strings.Split(rawhost, ".")[0])
    ip := GetLocalIP()

    // Check each lookup separately so a failure of the first one is not
    // masked by the second call overwriting err.
    env, err := ValueFromContextKey(subCtx, "environment")
    if err != nil {
        return err
    }
    cust, err := ValueFromContextKey(subCtx, "customer")
    if err != nil {
        return err
    }

    if host == "" || env == "" || cust == "" || ip == "" {
        return fmt.Errorf("either host, env, cust, or ip are not found in passed ctx")
    }

    jsonData, err := json.Marshal(TriggerPlaywrightTestsRequest{
        Env:      env,
        Customer: cust,
        Hostname: host,
        IP:       ip,
    })
    if err != nil {
        return err
    }

    reqCtx, cancel := context.WithTimeout(subCtx, 30*time.Second)
    defer cancel()

    client := &http.Client{
        Transport: otelhttp.NewTransport(http.DefaultTransport),
    }

    req, err := http.NewRequestWithContext(reqCtx, http.MethodPost, triggerPlaywrightTestsUrl, bytes.NewReader(jsonData))
    if err != nil {
        return err
    }
    req.Header.Set("Content-Type", "application/json")

    resp, err := client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    // Drain the body so the underlying connection can be reused; the payload
    // itself is not needed by the caller.
    if _, err := io.Copy(io.Discard, resp.Body); err != nil {
        return err
    }

    if resp.StatusCode < 200 || resp.StatusCode > 299 {
        return fmt.Errorf("triggering playwright tests: unexpected response status %s", resp.Status)
    }
    return nil
}

On the server side it's an Azure Function with an http trigger. I can tell that the traceparent header exists okay. But why is it saying there's a missing root span?

Here's the server side http server:

// main (excerpt): sets up tracing, registers the traced HTTP handler, and
// serves until ListenAndServe fails.
main(){
...

    // Build the tracer provider. createStdoutAndHTTPOTELExporter also
    // registers it globally and sets the text-map propagator.
    // NOTE(review): tp is not referenced again in this excerpt — confirm that
    // buffered spans are flushed (e.g. via tp.Shutdown) before the process exits.
    tp, err := createStdoutAndHTTPOTELExporter(ctx, azureFunctionsHoneycombEnvAPIKey)
    if err != nil {
        log.Fatalf("failed to create otel exporters %v", err)
    }

    // Wrap handleRequest in the otelhttp middleware, using "handleRequest1"
    // as the operation name passed to the middleware.
    middleware := otelhttp.NewMiddleware("handleRequest1")
    wrappedHandler := middleware(http.HandlerFunc(handleRequest))

    // Registers on http.DefaultServeMux, which ListenAndServe uses below
    // because its handler argument is nil.
    http.Handle("/api/my-function-url", wrappedHandler)

    // Listen on :8080 unless FUNCTIONS_CUSTOMHANDLER_PORT is set, in which
    // case that port is used instead.
    listenAddr := ":8080"
    if val, ok := os.LookupEnv("FUNCTIONS_CUSTOMHANDLER_PORT"); ok {
        listenAddr = ":" + val
    }

    log.Fatal(http.ListenAndServe(listenAddr, nil))
}

The trace provider:

// createStdoutAndHTTPOTELExporter builds a TracerProvider that batches spans
// to two exporters (stdoutExporter and an OTLP/HTTP exporter), installs it as
// the global tracer provider, and sets the global propagator to a composite
// of TraceContext and Baggage.
//
// azureFunctionsHoneycombEnvAPIKey is sent on every export request in the
// "x-honeycomb-team" header.
//
// It returns the provider, or a nil provider and an error if the OTLP/HTTP
// exporter cannot be created.
func createStdoutAndHTTPOTELExporter(ctx context.Context, azureFunctionsHoneycombEnvAPIKey string) (*sdktrace.TracerProvider, error) {

    var err error
    resource := createOTELResource(Version, BuildTime, VCSTag)
    
    // NOTE(review): the elided section presumably creates stdoutExporter,
    // which is used below — confirm against the full source.
    ...

    httpExporter, err := otlphttp.New(
        ctx,
        otlphttp.WithEndpoint("honeycomb-refinery-url:443"),
        otlphttp.WithHeaders(map[string]string{
            "x-honeycomb-team": azureFunctionsHoneycombEnvAPIKey,
        }),
    )
    if err != nil {
        return nil, fmt.Errorf("failed to create http exporter: %v", err)
    }

    var tracerProviderOptions []sdktrace.TracerProviderOption

    // Each exporter gets its own batcher, so every span is sent to both.
    tracerProviderOptions = append(tracerProviderOptions, sdktrace.WithBatcher(stdoutExporter), sdktrace.WithBatcher(httpExporter))

    // Start from the SDK's span limits and override the per-span event limit
    // with 1000.
    spanLimits := sdktrace.NewSpanLimits()
    spanLimits.EventCountLimit = 1000

    tracerProviderOptions = append(tracerProviderOptions,
        sdktrace.WithRawSpanLimits(spanLimits),
        sdktrace.WithResource(resource))
    // NOTE(review): tp is assigned with "=", not ":=", so it is presumably a
    // package-level variable declared outside this excerpt — confirm.
    tp = sdktrace.NewTracerProvider(tracerProviderOptions...)

    otel.SetTracerProvider(tp)
    otel.SetTextMapPropagator(
        propagation.NewCompositeTextMapPropagator(
            propagation.TraceContext{},
            propagation.Baggage{},
        ),
    )
    // err was checked right after otlphttp.New and is not reassigned in the
    // visible code after that, so it is nil here.
    return tp, err
}

Here's the request handler:

// handleRequest validates an inbound trigger request and runs the workflow
// trigger pipeline: generateJWT -> getAccessTokenURL -> getToken ->
// triggerWorkflow.
//
// It starts a "handleRequest" span from the request context and records the
// inbound traceparent header and request method as span events. Only POST and
// GET are accepted (405 otherwise). The body must be JSON decodable into
// InboundRequestData with Customer, Env, IP and Hostname all non-empty (400
// otherwise). A failure in any pipeline step yields a 500.
//
// Every failure is recorded on the span and sets its status to Error. Nothing
// is written to w before validation finishes: the first write to a
// ResponseWriter implicitly commits a 200 status, after which the status
// passed to http.Error would be ignored. The "r.Method:" line is therefore
// written only on success.
func handleRequest(w http.ResponseWriter, r *http.Request) {
    ctx, span := otel.Tracer("").Start(r.Context(), "handleRequest")
    defer span.End()

    // fail records err on the span and sends msg to the client with status.
    fail := func(status int, msg string, err error) {
        span.RecordError(err)
        span.SetStatus(codes.Error, err.Error())
        http.Error(w, msg, status)
    }

    span.AddEvent("inbound header traceparent: " + r.Header.Get("traceparent"))
    span.AddEvent("request method: " + r.Method)

    if r.Method != http.MethodPost && r.Method != http.MethodGet {
        fail(http.StatusMethodNotAllowed,
            "invalid request method. Not handling request",
            fmt.Errorf("invalid request method (not POST or GET): %s", r.Method))
        return
    }

    defer r.Body.Close()
    var data InboundRequestData
    if err := json.NewDecoder(r.Body).Decode(&data); err != nil {
        fail(http.StatusBadRequest, err.Error(), err)
        return
    }

    span.SetAttributes(
        attribute.String("http.inboundrequest.ip", data.IP),
        attribute.String("http.inboundrequest.hostname", data.Hostname),
    )

    if data.Customer == "" || data.Env == "" || data.IP == "" || data.Hostname == "" {
        err := fmt.Errorf("ip, hostname, customer, and env must be set in request body")
        fail(http.StatusBadRequest, err.Error(), err)
        return
    }

    ctx, jwt, err := generateJWT(ctx, AzureKeyVaultName, GitHubAppPrivateKeyAzureKeyName, GitHubAppID)
    if err != nil {
        fail(http.StatusInternalServerError, err.Error(), err)
        return
    }

    ctx, accessTokenURL, err := getAccessTokenURL(ctx, jwt)
    if err != nil {
        fail(http.StatusInternalServerError, err.Error(), err)
        return
    }

    ctx, installationToken, err := getToken(ctx, jwt, accessTokenURL)
    if err != nil {
        fail(http.StatusInternalServerError, err.Error(), err)
        return
    }

    if _, err = triggerWorkflow(ctx, installationToken, data); err != nil {
        fail(http.StatusInternalServerError, err.Error(), err)
        return
    }

    // Success: the status is still uncommitted, so this write sends 200.
    fmt.Fprintln(w, "r.Method:", r.Method)
}

I am using Honeycomb, why are my traces not being linked here?! Where is the missing root span coming from?

What I have tried:

  • using otelhttp.NewHandler:
    // Alternative wiring: wrap handleRequest with otelhttp.NewHandler (operation
    // name "handleRequest1") and register the wrapped handler on this route.
    otelHandler := otelhttp.NewHandler(http.HandlerFunc(handleRequest), "handleRequest1")
    http.Handle("/api/trigger-playwright-tests-workflow", otelHandler)

Though from the documentation, NewHandler supports metrics, whereas NewMiddleware supports tracing and metrics

I've also tried extracting the traceparent and creating a SpanContext manually.

1

There is 1 best solution below

0
SuperSecretAndNotSafeFromWork On

This was not a code issue but a Honeycomb environment problem on my side. My environments were configured like this:

Client programs prod: EnvA

Client programs demo: EnvB

Client programs uat: EnvC

My Azure Function: EnvAzure

The environment needs to match for the linking to occur.

From speaking to Honeycomb, the convention is to separate environments by deployment stage (e.g. PROD/UAT), whereas I had grouped services almost by tech stack + server + purpose.