[chore][pkg/stanza] Fixed broken benchmarks (open-telemetry#43190)

gnak-yar · andrzej-stencel · tommyers-elastic · commit 2cecf8bedf4d · 2025-10-10T11:33:56.000+01:00
#### Description This is another fix for open-telemetry#43044. Probably one more PR will be needed! Addressed 2 of the benchmark failures in `pkg/stanza/adapter` here. ``` --- FAIL: BenchmarkReadLine receiver_test.go:285: Error Trace: /Users/ray.kang/github.com/gnak-yar/opentelemetry-collector-contrib/pkg/stanza/adapter/receiver_test.go:285 Error: Received unexpected error: '' unsupported type 'file_input' Test: BenchmarkReadLine --- FAIL: BenchmarkParseAndMap receiver_test.go:369: Error Trace: /Users/ray.kang/github.com/gnak-yar/opentelemetry-collector-contrib/pkg/stanza/adapter/receiver_test.go:369 Error: Received unexpected error: unsupported type 'file_input' Test: BenchmarkParseAndMap ``` **BenchmarkReadLine** The `file_input` operator must be registered in the global operator registry before the benchmark runs. The registration happens when the file input operator package is imported, but the package isn't imported, so the error occurs. I've removed the benchmark instead of fixing it. Here are the reasons: - As far as I understand, this benchmark is meant to measure the `file_input` operator's performance, but we have other benchmarks doing almost the same thing. - [file.BenchmarkReadExistingLogs](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/7e4bf11279e5b454980626237ccd1417b7e0e26a/pkg/stanza/operator/input/file/benchmark_test.go#L23) - [fileconsumer.BenchmarkFileInput](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/7e4bf11279e5b454980626237ccd1417b7e0e26a/pkg/stanza/fileconsumer/benchmark_test.go#L31) - The test itself also seems a bit odd to me, because the `b.N` loop (`for i := 0; i < b.N; i++ {`) is used for log generation, not for the function being measured. **BenchmarkParseAndMap** There are two reasons for the failure: - The `file_input` package is not imported here either. - Unmarshaling YAML configs (line 369 before the change) does not work well, as the configs are not supposed to be loaded from YAML directly. 
For example, the following structure is a part of `fileconsumer.Config`. `yaml.Unmarshal` does not understand the embedded fields by default (probably an additional tag like `yaml:",inline"` is required). ```go type Config struct { matcher.Criteria `mapstructure:",squash"` attrs.Resolver `mapstructure:",squash"` ... } ``` As far as I understand, `BenchmarkParseAndMap` is meant to measure the performance of `regex_parser`'s severity mappings. So, I've moved the benchmark to `regex_parser` and made it focus on the operator's performance. I'm not familiar with the full history and background. Please let me know if you have any concerns or suggestions!  #### Link to tracking issue Fixes open-telemetry#43044  #### Testing ``` cd pkg/stanza/adapter go test -bench='^Benchmark(ReadLine|ParseAndMap)$' # For testing a benchmark moved to the regex parser. cd pkg/stanza/operator/parser/regex go test -bench='^BenchmarkProcessBatch' ``` --------- Co-authored-by: Andrzej Stencel <andrzej.stencel@elastic.co>
diff --git a/pkg/stanza/adapter/receiver_test.go b/pkg/stanza/adapter/receiver_test.go
@@ -7,8 +7,6 @@ import (
 	"context"
 	"fmt"
 	"math"
-	"os"
-	"path/filepath"
 	"sync/atomic"
 	"testing"
 	"time"
@@ -17,13 +15,11 @@ import (
 	"github.com/stretchr/testify/require"
 	"go.opentelemetry.io/collector/component"
 	"go.opentelemetry.io/collector/component/componenttest"
-	"go.opentelemetry.io/collector/confmap/confmaptest"
 	"go.opentelemetry.io/collector/consumer"
 	"go.opentelemetry.io/collector/consumer/consumertest"
 	"go.opentelemetry.io/collector/pdata/plog"
 	"go.opentelemetry.io/collector/receiver/receiverhelper"
 	"go.opentelemetry.io/collector/receiver/receivertest"
-	"gopkg.in/yaml.v3"
 
 	"github.com/open-telemetry/opentelemetry-collector-contrib/extension/storage/storagetest"
 	"github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/consumerretry"
@@ -259,150 +255,6 @@ func benchmarkReceiver(b *testing.B, logsPerIteration int, batchingInput, batchi
 	require.NoError(b, rcv.Shutdown(b.Context()))
 }
 
-func BenchmarkReadLine(b *testing.B) {
-	receivedAllLogs := make(chan struct{})
-	filePath := filepath.Join(b.TempDir(), "bench.log")
-
-	pipelineYaml := fmt.Sprintf(`
-pipeline:
-  type: file_input
-  include:
-    - %s
-  start_at: beginning`,
-		filePath)
-
-	confmapFilePath := filepath.Join(b.TempDir(), "conf.yaml")
-	require.NoError(b, os.WriteFile(confmapFilePath, []byte(pipelineYaml), 0o600))
-
-	testConfMaps, err := confmaptest.LoadConf(confmapFilePath)
-	require.NoError(b, err)
-
-	conf, err := testConfMaps.Sub("pipeline")
-	require.NoError(b, err)
-	require.NotNil(b, conf)
-
-	operatorCfg := operator.Config{}
-	require.NoError(b, conf.Unmarshal(&operatorCfg))
-
-	operatorCfgs := []operator.Config{operatorCfg}
-
-	storageClient := storagetest.NewInMemoryClient(
-		component.KindReceiver,
-		component.MustNewID("foolog"),
-		"test",
-	)
-
-	obsrecv, err := receiverhelper.NewObsReport(receiverhelper.ObsReportSettings{ReceiverCreateSettings: receivertest.NewNopSettings(component.MustNewType("foolog"))})
-	require.NoError(b, err)
-
-	mockConsumer := &testConsumer{
-		receivedAllLogs: receivedAllLogs,
-		expectedLogs:    uint32(b.N),
-		receivedLogs:    atomic.Uint32{},
-	}
-	rcv := &receiver{
-		consumer:      mockConsumer,
-		obsrecv:       obsrecv,
-		storageClient: storageClient,
-	}
-
-	set := componenttest.NewNopTelemetrySettings()
-	emitter := helper.NewBatchingLogEmitter(set, rcv.consumeEntries)
-	defer func() {
-		require.NoError(b, emitter.Stop())
-	}()
-
-	pipe, err := pipeline.Config{
-		Operators:     operatorCfgs,
-		DefaultOutput: emitter,
-	}.Build(set)
-	require.NoError(b, err)
-
-	rcv.pipe = pipe
-	rcv.set = set
-	rcv.emitter = emitter
-
-	// Populate the file that will be consumed
-	file, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE, 0o666)
-	require.NoError(b, err)
-	for i := 0; i < b.N; i++ {
-		_, err := file.WriteString("testlog\n")
-		require.NoError(b, err)
-	}
-
-	// Run the actual benchmark
-	b.ResetTimer()
-	require.NoError(b, rcv.Start(b.Context(), nil))
-
-	<-receivedAllLogs
-
-	require.NoError(b, rcv.Shutdown(b.Context()))
-}
-
-func BenchmarkParseAndMap(b *testing.B) {
-	filePath := filepath.Join(b.TempDir(), "bench.log")
-
-	fileInputYaml := fmt.Sprintf(`
-- type: file_input
-  include:
-    - %s
-  start_at: beginning`, filePath)
-
-	regexParserYaml := `
-- type: regex_parser
-  regex: '(?P<remote_host>[^\s]+) - (?P<remote_user>[^\s]+) \[(?P<timestamp>[^\]]+)\] "(?P<http_method>[A-Z]+) (?P<path>[^\s]+)[^"]+" (?P<http_status>\d+) (?P<bytes_sent>[^\s]+)'
-  timestamp:
-    parse_from: timestamp
-    layout: '%d/%b/%Y:%H:%M:%S %z'
-  severity:
-    parse_from: http_status
-    preserve: true
-    mapping:
-      critical: 5xx
-      error: 4xx
-      info: 3xx
-      debug: 2xx`
-
-	pipelineYaml := fmt.Sprintf("%s%s", fileInputYaml, regexParserYaml)
-
-	var operatorCfgs []operator.Config
-	require.NoError(b, yaml.Unmarshal([]byte(pipelineYaml), &operatorCfgs))
-
-	set := componenttest.NewNopTelemetrySettings()
-	emitter := helper.NewBatchingLogEmitter(set, func(_ context.Context, entries []*entry.Entry) {
-		for _, e := range entries {
-			convert(e)
-		}
-	})
-	defer func() {
-		require.NoError(b, emitter.Stop())
-	}()
-
-	pipe, err := pipeline.Config{
-		Operators:     operatorCfgs,
-		DefaultOutput: emitter,
-	}.Build(set)
-	require.NoError(b, err)
-
-	// Populate the file that will be consumed
-	file, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE, 0o666)
-	require.NoError(b, err)
-	for i := 0; i < b.N; i++ {
-		_, err := fmt.Fprintf(file, "10.33.121.119 - - [11/Aug/2020:00:00:00 -0400] \"GET /index.html HTTP/1.1\" 404 %d\n", i%1000)
-		require.NoError(b, err)
-	}
-
-	storageClient := storagetest.NewInMemoryClient(
-		component.KindReceiver,
-		component.MustNewID("foolog"),
-		"test",
-	)
-
-	// Run the actual benchmark
-	b.ResetTimer()
-	require.NoError(b, pipe.Start(storageClient))
-}
-
 const testInputOperatorTypeStr = "test_input"
 
 type testInputBuilder struct {
diff --git a/pkg/stanza/operator/parser/regex/benchmark_test.go b/pkg/stanza/operator/parser/regex/benchmark_test.go
@@ -0,0 +1,50 @@
+// Copyright The OpenTelemetry Authors
+// SPDX-License-Identifier: Apache-2.0
+
+package regex
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+	"go.opentelemetry.io/collector/component/componenttest"
+
+	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/entry"
+	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator/helper"
+)
+
+// BenchmarkProcessBatch times ProcessBatch on a regex parser configured with timestamp and severity parsing.
+func BenchmarkProcessBatch(b *testing.B) {
+	b.Run("SeverityMapping", func(b *testing.B) {
+		config := NewConfig()
+		config.OnError = helper.SendOnError
+		config.Regex = `(?P<remote_host>[^\s]+) - (?P<remote_user>[^\s]+) \[(?P<timestamp>[^\]]+)\] "(?P<http_method>[A-Z]+) (?P<path>[^\s]+) [^"]+" (?P<http_status>\d+) (?P<bytes_sent>[^\s]+)`
+		config.TimeParser = &helper.TimeParser{
+			ParseFrom:  func() *entry.Field { f := entry.NewAttributeField("timestamp"); return &f }(),
+			Layout:     "%d/%b/%Y:%H:%M:%S %z",
+			LayoutType: helper.StrptimeKey,
+		}
+		config.SeverityConfig = &helper.SeverityConfig{
+			ParseFrom: func() *entry.Field { f := entry.NewAttributeField("http_status"); return &f }(),
+			Mapping: map[string]any{
+				"critical": "5xx",
+				"error":    "4xx",
+				"info":     "3xx",
+				"debug":    "2xx",
+			},
+		}
+		op, err := config.Build(componenttest.NewNopTelemetrySettings())
+		require.NoError(b, err)
+		// Allocate exactly as many slots as are filled, so no nil *entry.Entry is ever handed to the operator.
+		entries := make([]*entry.Entry, 1000)
+		for i := range entries {
+			entries[i] = entry.New()
+			entries[i].Body = fmt.Sprintf("10.33.121.119 - - [11/Aug/2020:00:00:00 -0400] \"GET /index.html HTTP/1.1\" 404 %d\n", i%1000)
+		}
+		// Setup above runs once; b.Loop repeats only the ProcessBatch call on the same prepared batch.
+		for b.Loop() {
+			require.NoError(b, op.ProcessBatch(b.Context(), entries))
+		}
+	})
+}