diff --git a/.gitignore b/.gitignore index 3299f24..749111e 100644 --- a/.gitignore +++ b/.gitignore @@ -31,5 +31,8 @@ go.work.sum .idea/ # .vscode/ +# Claude Code local state +.claude/ + /bagel dist/ diff --git a/CLA_SIGNATURES.md b/CLA_SIGNATURES.md index 666a139..9d9a7f2 100644 --- a/CLA_SIGNATURES.md +++ b/CLA_SIGNATURES.md @@ -2,4 +2,5 @@ SUSTAPLE117 - Alexis-Maurer Fortin fproulx-boostsecurity - François Proulx Talgarr - Sebastien Graveline julien-boost - Julien Champoux -GuillaumeRoss - Guillaume Ross \ No newline at end of file +GuillaumeRoss - Guillaume Ross +c0tton-fluff - Michal Ambrozkiewicz \ No newline at end of file diff --git a/README.md b/README.md index 1bceceb..ee87da7 100644 --- a/README.md +++ b/README.md @@ -280,6 +280,55 @@ All probes work cross-platform with appropriate path handling for each OS. --- +## Scrub Command + +> **Fork addition** -- not in upstream Bagel. + +`bagel scrub` removes credentials from AI CLI session logs and shell history files, replacing them with `[REDACTED-<type>]` markers while preserving conversation context. 
+ +```bash +# Scan and interactively confirm (default) +bagel scrub + +# Skip prompt, apply immediately +bagel scrub --yes + +# Scan only, no modifications +bagel scrub --dry-run + +# Scrub without grace period (includes recent files) +bagel scrub --yes --grace-minutes 0 + +# Scrub a single file +bagel scrub --yes --file ~/.claude/projects/foo/abc123.jsonl +``` + +| Flag | Default | Description | +|------|---------|-------------| +| `--yes` / `-y` | `false` | Skip confirmation prompt and apply changes | +| `--dry-run` | `false` | Scan and report only, do not modify files | +| `--grace-minutes` | `60` | Skip files modified within this many minutes | +| `--file` | | Scrub a single file instead of all eligible files | + +**Targets:** +- `~/.claude/projects/**/*.jsonl` -- Claude Code session logs +- `~/.claude/projects/**/*.txt` -- Claude Code tool results +- `~/.codex/sessions/**/*.jsonl` -- Codex CLI session logs +- `~/.gemini/tmp/*/chats/*.json` -- Gemini CLI chat logs +- `~/.local/share/opencode/**/*.json` -- OpenCode session logs +- `~/.bash_history` -- Bash shell history +- `~/.zsh_history` -- Zsh shell history +- `~/.sh_history` -- Generic shell history +- `~/.local/share/fish/fish_history` -- Fish shell history + +**Recommended workflow:** +1. `bagel scan -f table` -- assess your exposure +2. `bagel scrub --yes` -- clean up +3. `bagel scan -f table` -- verify reduction +4. Rotate any credentials that were found + +--- + ## Integrations * **CI**: run `bagel scan --strict` in your pipeline to fail builds when findings are detected. 
diff --git a/cmd/bagel/scrub.go b/cmd/bagel/scrub.go new file mode 100644 index 0000000..b777284 --- /dev/null +++ b/cmd/bagel/scrub.go @@ -0,0 +1,304 @@ +// Copyright (C) 2026 boostsecurity.io +// SPDX-License-Identifier: GPL-3.0-or-later + +package main + +import ( + "bufio" + "fmt" + "os" + "sort" + "strings" + "time" + + "github.com/boostsecurityio/bagel/pkg/collector" + "github.com/boostsecurityio/bagel/pkg/config" + "github.com/boostsecurityio/bagel/pkg/detector" + "github.com/boostsecurityio/bagel/pkg/models" + "github.com/boostsecurityio/bagel/pkg/probe" + "github.com/boostsecurityio/bagel/pkg/scrubber" + "github.com/mattn/go-isatty" + "github.com/rs/zerolog" + "github.com/spf13/cobra" +) + +var ( + scrubYes bool + scrubDryRun bool + scrubGraceMinutes int + scrubFile string +) + +// scrubCmd represents the scrub command +var scrubCmd = &cobra.Command{ + Use: "scrub", + Short: "Remove credentials from AI CLI session logs and shell history", + Long: `Scrub replaces credential patterns in AI CLI session logs and shell +history files with [REDACTED-] markers. Preserves all context -- +only secrets become useless. + +By default shows what would be changed and asks for confirmation. +Use --yes to skip the prompt, or --dry-run to only report. 
+ +Targets: + ~/.claude/projects/**/*.jsonl Claude Code session logs + ~/.codex/sessions/**/*.jsonl Codex CLI session logs + ~/.gemini/tmp/*/chats/*.json Gemini CLI chat logs + ~/.local/share/opencode/**/*.json OpenCode session logs + ~/.bash_history Bash shell history + ~/.zsh_history Zsh shell history + ~/.sh_history Generic shell history + ~/.local/share/fish/fish_history Fish shell history`, + RunE: runScrub, +} + +func init() { + rootCmd.AddCommand(scrubCmd) + + scrubCmd.Flags().BoolVarP( + &scrubYes, "yes", "y", false, + "skip confirmation prompt and apply changes") + scrubCmd.Flags().BoolVar( + &scrubDryRun, "dry-run", false, + "scan and report only, do not modify files") + scrubCmd.Flags().IntVar( + &scrubGraceMinutes, "grace-minutes", 60, + "skip files modified within this many minutes") + scrubCmd.Flags().StringVar( + &scrubFile, "file", "", + "scrub a single file instead of all eligible files") +} + +const scopeWarning = `NOTE: bagel scrub redacts credentials found in session logs and shell +history files. It does NOT rotate or revoke exposed credentials. +Credentials that appeared in these files may already be compromised. + +For findings requiring manual action (key rotation, re-encryption), +run 'bagel scan' and follow the remediation guidance. +` + +// newScrubRegistry builds a detector registry configured for redaction. +// Registration order matters: specific patterns before general ones. 
+func newScrubRegistry() *detector.Registry { + registry := detector.NewRegistry() + registry.Register(detector.NewSSHPrivateKeyDetector()) + registry.Register(detector.NewHTTPAuthDetector()) + registry.Register(detector.NewAIServiceDetector()) + registry.Register(detector.NewCloudCredentialsDetector()) + registry.Register(detector.NewSplunkTokenDetector()) + registry.Register(detector.NewGitHubPATDetector()) + registry.Register(detector.NewNPMTokenDetector()) + registry.Register(detector.NewJWTDetector()) + registry.Register(detector.NewGenericAPIKeyDetector()) + return registry +} + +func runScrub(cmd *cobra.Command, _ []string) error { + ctx := cmd.Context() + log := zerolog.Ctx(ctx) + + registry := newScrubRegistry() + + // Resolve target files + files, err := resolveScrubFiles(cmd, registry) + if err != nil { + return err + } + + // Phase 1: Preview + previewResult, err := scrubber.Preview(ctx, scrubber.PreviewInput{ + Files: files, + Registry: registry, + }) + if err != nil { + return fmt.Errorf("scrub preview: %w", err) + } + + fmt.Print("\n" + scopeWarning + "\n") + printPreviewSummary(previewResult) + + if previewResult.Redactions == 0 { + fmt.Println("Nothing to scrub.") + return nil + } + + // Phase 2: Decide whether to apply + if scrubDryRun { + fmt.Println("[DRY RUN] No files were modified.") + return nil + } + + if !scrubYes { + if !isInteractive() { + fmt.Println("Non-interactive terminal detected. Use --yes to apply, or --dry-run to scan only.") + return nil + } + if !promptConfirm() { + fmt.Println("Aborted.") + return nil + } + } + + // Phase 3: Apply + applyResult, err := scrubber.Apply(ctx, scrubber.ApplyInput{ + Files: previewResult.Files, + Registry: registry, + }) + if err != nil { + return fmt.Errorf("scrub apply: %w", err) + } + + log.Info(). + Int("files_modified", applyResult.FilesModified). + Int("redactions", applyResult.Redactions). 
+ Msg("Scrub complete") + + fmt.Printf("\nScrub applied:\n") + fmt.Printf(" Files modified: %d\n", applyResult.FilesModified) + fmt.Printf(" Redactions: %d\n", applyResult.Redactions) + printCountsByType(applyResult.CountsByType) + + return nil +} + +// resolveScrubFiles determines which files to scrub. When --file is set, it +// targets that single file. Otherwise it runs the scan pipeline (FileIndex + +// probes) and extracts file paths from findings. +func resolveScrubFiles(cmd *cobra.Command, registry *detector.Registry) ([]string, error) { + if scrubFile != "" { + if _, err := os.Stat(scrubFile); err != nil { + return nil, fmt.Errorf("file not found: %s", scrubFile) + } + return []string{scrubFile}, nil + } + + ctx := cmd.Context() + + cfg, err := config.Load(cfgFile) + if err != nil { + return nil, fmt.Errorf("load config: %w", err) + } + + probes := []probe.Probe{ + probe.NewAICliProbe(cfg.Probes.AICli, registry), + probe.NewShellHistoryProbe(cfg.Probes.ShellHistory, registry), + } + + col := collector.New(collector.NewInput{ + Probes: probes, + Config: cfg, + NoCache: true, + NoProgress: true, + }) + + result, err := col.Collect(ctx) + if err != nil { + return nil, fmt.Errorf("scan: %w", err) + } + + files := uniqueFilePaths(result.Findings) + files = filterByGracePeriod(files, scrubGraceMinutes) + + return files, nil +} + +// uniqueFilePaths extracts deduplicated file paths from findings. +// Finding.Path uses the format "file:/path/to/file" (sometimes with +// ":lineNum" appended by FormatSource); we strip the "file:" prefix. +func uniqueFilePaths(findings []models.Finding) []string { + seen := make(map[string]struct{}, len(findings)) + paths := make([]string, 0, len(findings)) + + for _, f := range findings { + p := f.Path + p, _ = strings.CutPrefix(p, "file:") + + // Strip trailing ":lineNum" if present (e.g. 
"file:/path:42") + if idx := strings.LastIndex(p, ":"); idx > 0 { + candidate := p[:idx] + if _, err := os.Stat(candidate); err == nil { + p = candidate + } + } + + if _, ok := seen[p]; ok { + continue + } + seen[p] = struct{}{} + paths = append(paths, p) + } + + return paths +} + +// filterByGracePeriod removes files modified within the last graceMins +// minutes. Files with unreadable metadata are silently skipped. +func filterByGracePeriod(files []string, graceMins int) []string { + if graceMins <= 0 { + return files + } + + cutoff := time.Now().Add(-time.Duration(graceMins) * time.Minute) + filtered := make([]string, 0, len(files)) + + for _, path := range files { + info, err := os.Stat(path) + if err != nil { + continue + } + if info.ModTime().Before(cutoff) { + filtered = append(filtered, path) + } + } + + return filtered +} + +func printPreviewSummary(r scrubber.PreviewResult) { + fmt.Printf("Scan results:\n") + fmt.Printf(" Files scanned: %d\n", r.FilesScanned) + fmt.Printf(" Files with secrets: %d\n", len(r.Files)) + fmt.Printf(" Total redactions: %d\n", r.Redactions) + printCountsByType(r.CountsByType) + if len(r.Files) > 0 { + fmt.Printf(" Files:\n") + for _, f := range r.Files { + fmt.Printf(" %s\n", f) + } + } + fmt.Println() +} + +func printCountsByType(counts map[string]int) { + if len(counts) == 0 { + return + } + fmt.Println(" By type:") + for _, k := range sortedKeys(counts) { + fmt.Printf(" %s: %d\n", k, counts[k]) + } +} + +func isInteractive() bool { + return isatty.IsTerminal(os.Stdin.Fd()) || + isatty.IsCygwinTerminal(os.Stdin.Fd()) +} + +func promptConfirm() bool { + fmt.Print("Proceed with scrubbing? 
// sortedKeys returns the keys of m in ascending lexical order so that callers
// can print map contents deterministically. The result is never nil.
func sortedKeys(m map[string]int) []string {
	names := make([]string, len(m))
	next := 0
	for name := range m {
		names[next] = name
		next++
	}
	sort.Sort(sort.StringSlice(names))
	return names
}
+func TestFilterByGracePeriod(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + + // Create an old file (2 hours ago) + oldFile := filepath.Join(dir, "old.jsonl") + require.NoError(t, os.WriteFile(oldFile, []byte("data"), 0600)) + oldTime := time.Now().Add(-2 * time.Hour) + require.NoError(t, os.Chtimes(oldFile, oldTime, oldTime)) + + // Create a recent file (just now) + newFile := filepath.Join(dir, "new.jsonl") + require.NoError(t, os.WriteFile(newFile, []byte("data"), 0600)) + + t.Run("filters recent files", func(t *testing.T) { + t.Parallel() + result := filterByGracePeriod([]string{oldFile, newFile}, 60) + assert.Contains(t, result, oldFile) + assert.NotContains(t, result, newFile) + }) + + t.Run("grace zero includes all", func(t *testing.T) { + t.Parallel() + result := filterByGracePeriod([]string{oldFile, newFile}, 0) + assert.Contains(t, result, oldFile) + assert.Contains(t, result, newFile) + }) + + t.Run("empty input", func(t *testing.T) { + t.Parallel() + result := filterByGracePeriod(nil, 60) + assert.Empty(t, result) + }) + + t.Run("nonexistent file skipped", func(t *testing.T) { + t.Parallel() + result := filterByGracePeriod([]string{"/nonexistent/path"}, 60) + assert.Empty(t, result) + }) +} diff --git a/pkg/config/config.go b/pkg/config/config.go index 127ed4e..ecb90b3 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -157,6 +157,7 @@ func setDefaults(v *viper.Viper) { ".zsh_history", ".sh_history", ".history", + ".local/share/fish/fish_history", // PowerShell history (Windows) "AppData/Roaming/Microsoft/Windows/PowerShell/PSReadLine/ConsoleHost_history.txt", }, "type": "glob"}, diff --git a/pkg/detector/ai_service.go b/pkg/detector/ai_service.go index fe8dda6..58c0ef3 100644 --- a/pkg/detector/ai_service.go +++ b/pkg/detector/ai_service.go @@ -12,12 +12,34 @@ import ( // AIServiceDetector detects API keys for various AI services type AIServiceDetector struct { - tokenPatterns map[string]*tokenPattern + tokenPatterns 
map[string]*tokenPattern + redactPatterns []RedactPattern } // NewAIServiceDetector creates a new AI service API key detector func NewAIServiceDetector() *AIServiceDetector { return &AIServiceDetector{ + // Redaction patterns: specific before general (Anthropic before generic sk-) + redactPatterns: []RedactPattern{ + { + Regex: regexp.MustCompile(`sk-ant-[A-Za-z0-9_-]{20,}`), + Replacement: `[REDACTED-anthropic-key]`, + Label: "REDACTED-anthropic-key", + Prefixes: []string{"sk-ant-"}, + }, + { + Regex: regexp.MustCompile(`sk-proj-[A-Za-z0-9_-]{20,}`), + Replacement: `[REDACTED-openai-key]`, + Label: "REDACTED-openai-key", + Prefixes: []string{"sk-proj-"}, + }, + { + Regex: regexp.MustCompile(`sk-[A-Za-z0-9]{40,}`), + Replacement: `[REDACTED-openai-key]`, + Label: "REDACTED-openai-key", + Prefixes: []string{"sk-"}, + }, + }, tokenPatterns: map[string]*tokenPattern{ "openai": { // Put hyphen at end of character class to avoid escaping issues @@ -74,6 +96,11 @@ func (d *AIServiceDetector) Detect(content string, ctx *models.DetectionContext) return findings } +// Redact replaces AI service API keys in content with redaction markers. 
+func (d *AIServiceDetector) Redact(content string) (string, map[string]int) { + return ApplyRedactPatterns(content, d.redactPatterns) +} + // createFinding creates a finding for a detected AI service API key func (d *AIServiceDetector) createFinding(token string, pattern *tokenPattern, ctx *models.DetectionContext) models.Finding { return models.Finding{ diff --git a/pkg/detector/cloud_credentials.go b/pkg/detector/cloud_credentials.go index 7c3b7e9..d4ad705 100644 --- a/pkg/detector/cloud_credentials.go +++ b/pkg/detector/cloud_credentials.go @@ -13,11 +13,59 @@ import ( // CloudCredentialsDetector detects cloud provider credentials (AWS, GCP, Azure) type CloudCredentialsDetector struct { credentialPatterns []*tokenPattern + redactPatterns []RedactPattern } // NewCloudCredentialsDetector creates a new cloud credentials detector func NewCloudCredentialsDetector() *CloudCredentialsDetector { return &CloudCredentialsDetector{ + redactPatterns: []RedactPattern{ + { + Regex: regexp.MustCompile(`AKIA[0-9A-Z]{16}`), + Replacement: `[REDACTED-aws-access-key]`, + Label: "REDACTED-aws-access-key", + Prefixes: []string{"AKIA"}, + }, + { + Regex: regexp.MustCompile(`ASIA[0-9A-Z]{16}`), + Replacement: `[REDACTED-aws-sts-key]`, + Label: "REDACTED-aws-sts-key", + Prefixes: []string{"ASIA"}, + }, + { + Regex: regexp.MustCompile( + `((?:aws_session_token|AWS_SESSION_TOKEN|SessionToken)["\s:=]+)[A-Za-z0-9+/=]{100,}`), + Replacement: `${1}[REDACTED-aws-session-token]`, + Label: "REDACTED-aws-session-token", + Prefixes: []string{"aws_session_token", "AWS_SESSION_TOKEN", "SessionToken"}, + }, + { + Regex: regexp.MustCompile(`IQoJb3JpZ2lu[A-Za-z0-9+/=]{100,}`), + Replacement: `[REDACTED-aws-session-token]`, + Label: "REDACTED-aws-session-token", + Prefixes: []string{"IQoJb3JpZ2lu"}, + }, + { + Regex: regexp.MustCompile( + `((?:aws_secret_access_key|secret_access_key|SecretAccessKey)["\s:=]+)[A-Za-z0-9+/]{40}`), + Replacement: `${1}[REDACTED-aws-secret-key]`, + Label: 
"REDACTED-aws-secret-key", + Prefixes: []string{"aws_secret_access_key", "secret_access_key", "SecretAccessKey"}, + }, + { + Regex: regexp.MustCompile( + `((?:AccountKey|storage_key|StorageKey)["\s:=]+)[A-Za-z0-9+/]{86}==`), + Replacement: `${1}[REDACTED-azure-storage-key]`, + Label: "REDACTED-azure-storage-key", + Prefixes: []string{"AccountKey", "storage_key", "StorageKey"}, + }, + { + Regex: regexp.MustCompile(`AIza[A-Za-z0-9_-]{35}`), + Replacement: `[REDACTED-gcp-api-key]`, + Label: "REDACTED-gcp-api-key", + Prefixes: []string{"AIza"}, + }, + }, // Patterns are checked in order - more specific patterns should come first credentialPatterns: []*tokenPattern{ // Azure Credentials (check first - most specific due to length) @@ -45,6 +93,27 @@ func NewCloudCredentialsDetector() *CloudCredentialsDetector { tokenType: "gcp-api-key", description: "Google Cloud API Key", }, + + // AWS Session Token (labeled) + { + regex: regexp.MustCompile(`(?:aws_session_token|AWS_SESSION_TOKEN|SessionToken)["\s:=]+([A-Za-z0-9+/=]{100,})`), + tokenType: "aws-session-token", + description: "AWS Session Token", + }, + + // AWS STS Session Token (label-free, base64 prefix) + { + regex: regexp.MustCompile(`\b(IQoJb3JpZ2lu[A-Za-z0-9+/=]{100,})\b`), + tokenType: "aws-sts-session-token", + description: "AWS STS Session Token", + }, + + // AWS Secret Access Key (labeled) + { + regex: regexp.MustCompile(`(?:aws_secret_access_key|secret_access_key|SecretAccessKey)["\s:=]+([A-Za-z0-9+/]{40})`), + tokenType: "aws-secret-access-key", + description: "AWS Secret Access Key", + }, }, } } @@ -82,6 +151,11 @@ func (d *CloudCredentialsDetector) Detect(content string, ctx *models.DetectionC return findings } +// Redact replaces cloud credentials in content with redaction markers. 
+func (d *CloudCredentialsDetector) Redact(content string) (string, map[string]int) { + return ApplyRedactPatterns(content, d.redactPatterns) +} + // createFinding creates a finding for detected cloud credentials func (d *CloudCredentialsDetector) createFinding(credential string, pattern *tokenPattern, ctx *models.DetectionContext) models.Finding { // All cloud credentials are critical severity (we're only detecting actual secrets now) diff --git a/pkg/detector/detector.go b/pkg/detector/detector.go index 66df691..c6104d4 100644 --- a/pkg/detector/detector.go +++ b/pkg/detector/detector.go @@ -6,6 +6,8 @@ package detector import ( "crypto/sha256" "encoding/hex" + "regexp" + "strings" "github.com/boostsecurityio/bagel/pkg/models" ) @@ -27,6 +29,20 @@ type Detector interface { Detect(content string, ctx *models.DetectionContext) []models.Finding } +// Redactor is optionally implemented by detectors that support +// content redaction (find-and-replace of secrets). +type Redactor interface { + Redact(content string) (string, map[string]int) +} + +// RedactPattern holds a compiled regex for content redaction. 
+type RedactPattern struct { + Regex *regexp.Regexp + Replacement string + Label string + Prefixes []string +} + // Registry manages all registered detectors type Registry struct { detectors []Detector @@ -47,7 +63,7 @@ func (r *Registry) Register(d Detector) { // DetectAll runs all registered detectors against the content // The context parameter provides probe-specific metadata that gets included in findings func (r *Registry) DetectAll(content string, ctx *models.DetectionContext) []models.Finding { - var findings []models.Finding + findings := make([]models.Finding, 0, len(r.detectors)) for _, det := range r.detectors { detectorFindings := det.Detect(content, ctx) @@ -84,3 +100,50 @@ func (r *Registry) DetectAll(content string, ctx *models.DetectionContext) []mod func (r *Registry) GetDetectors() []Detector { return r.detectors } + +// RedactAll runs all registered detectors that implement Redactor. +// Detectors are applied in registration order. +func (r *Registry) RedactAll(content string) (string, map[string]int) { + counts := make(map[string]int) + for _, det := range r.detectors { + red, ok := det.(Redactor) + if !ok { + continue + } + var detCounts map[string]int + content, detCounts = red.Redact(content) + for k, v := range detCounts { + counts[k] += v + } + } + return content, counts +} + +// ApplyRedactPatterns applies redaction patterns to content, returning +// the redacted text and a map of label to match count. 
+func ApplyRedactPatterns( + content string, + patterns []RedactPattern, +) (string, map[string]int) { + counts := make(map[string]int) + for _, p := range patterns { + if !containsAny(content, p.Prefixes) { + continue + } + matches := p.Regex.FindAllString(content, -1) + if len(matches) > 0 { + counts[p.Label] += len(matches) + content = p.Regex.ReplaceAllString(content, p.Replacement) + } + } + return content, counts +} + +func containsAny(content string, prefixes []string) bool { + for _, p := range prefixes { + if strings.Contains(content, p) { + return true + } + } + return false +} diff --git a/pkg/detector/generic_api_key.go b/pkg/detector/generic_api_key.go index 28a605e..75adce0 100644 --- a/pkg/detector/generic_api_key.go +++ b/pkg/detector/generic_api_key.go @@ -17,6 +17,7 @@ type GenericAPIKeyDetector struct { regex *regexp.Regexp minEntropy float64 excludePatterns []*regexp.Regexp + redactPatterns []RedactPattern } // NewGenericAPIKeyDetector creates a new generic API key detector @@ -24,8 +25,9 @@ func NewGenericAPIKeyDetector() *GenericAPIKeyDetector { pattern := `(?i)[\w.-]{0,50}?(?:access|auth|(?-i:[Aa]pi|API)|credential|creds|key|passw(?:or)?d|secret|token)(?:[ \t\w.-]{0,20})[\s'"]{0,3}(?:=|>|:{1,3}=|\|\||:|=>|\?=|,)[\x60'"\s=]{0,5}([\w.=-]{10,150}|[a-z0-9][a-z0-9+/]{11,}={0,3})(?:[\x60'"\s;]|\\[nr]|$)` return &GenericAPIKeyDetector{ - regex: regexp.MustCompile(pattern), - minEntropy: 3.5, + regex: regexp.MustCompile(pattern), + minEntropy: 3.5, + redactPatterns: nil, // Header-based API key redaction is handled by HTTPAuthDetector excludePatterns: []*regexp.Regexp{ // Exclude common placeholders and examples regexp.MustCompile(`(?i)^(your|my|the|example|sample|test|demo|placeholder|change|replace|insert|put)[-_]`), @@ -119,6 +121,11 @@ func (d *GenericAPIKeyDetector) calculateEntropy(s string) float64 { return entropy } +// Redact replaces generic API keys in content with redaction markers. 
+func (d *GenericAPIKeyDetector) Redact(content string) (string, map[string]int) { + return ApplyRedactPatterns(content, d.redactPatterns) +} + // createFinding creates a finding for a detected generic API key func (d *GenericAPIKeyDetector) createFinding(secret string, entropy float64, ctx *models.DetectionContext) models.Finding { return models.Finding{ diff --git a/pkg/detector/github_pat.go b/pkg/detector/github_pat.go index cc6f190..0b3547f 100644 --- a/pkg/detector/github_pat.go +++ b/pkg/detector/github_pat.go @@ -12,7 +12,8 @@ import ( // GitHubTokenDetector detects various GitHub token types type GitHubTokenDetector struct { - tokenPatterns map[string]*tokenPattern + tokenPatterns map[string]*tokenPattern + redactPatterns []RedactPattern } type tokenPattern struct { @@ -56,6 +57,38 @@ func NewGitHubPATDetector() *GitHubTokenDetector { description: "GitHub Refresh Token", }, }, + redactPatterns: []RedactPattern{ + { + Regex: regexp.MustCompile(`ghp_[A-Za-z0-9_]{36,}`), + Replacement: `[REDACTED-github-pat]`, + Label: "REDACTED-github-pat", + Prefixes: []string{"ghp_"}, + }, + { + Regex: regexp.MustCompile(`gho_[A-Za-z0-9_]{36,}`), + Replacement: `[REDACTED-github-oauth]`, + Label: "REDACTED-github-oauth", + Prefixes: []string{"gho_"}, + }, + { + Regex: regexp.MustCompile(`ghu_[A-Za-z0-9_]{36,}`), + Replacement: `[REDACTED-github-user]`, + Label: "REDACTED-github-user", + Prefixes: []string{"ghu_"}, + }, + { + Regex: regexp.MustCompile(`ghs_[A-Za-z0-9_]{36,}`), + Replacement: `[REDACTED-github-app]`, + Label: "REDACTED-github-app", + Prefixes: []string{"ghs_"}, + }, + { + Regex: regexp.MustCompile(`github_pat_[A-Za-z0-9_]{22,}`), + Replacement: `[REDACTED-github-fine-pat]`, + Label: "REDACTED-github-fine-pat", + Prefixes: []string{"github_pat_"}, + }, + }, } } @@ -79,6 +112,11 @@ func (d *GitHubTokenDetector) Detect(content string, ctx *models.DetectionContex return findings } +// Redact replaces GitHub tokens in content with redaction markers. 
+func (d *GitHubTokenDetector) Redact(content string) (string, map[string]int) { + return ApplyRedactPatterns(content, d.redactPatterns) +} + // createFinding creates a finding for a detected GitHub token func (d *GitHubTokenDetector) createFinding(token string, pattern *tokenPattern, ctx *models.DetectionContext) models.Finding { return models.Finding{ diff --git a/pkg/detector/http_auth.go b/pkg/detector/http_auth.go index 27af0ae..c1d5238 100644 --- a/pkg/detector/http_auth.go +++ b/pkg/detector/http_auth.go @@ -12,12 +12,50 @@ import ( // HTTPAuthDetector detects HTTP authentication credentials in various contexts type HTTPAuthDetector struct { - authPatterns map[string]*tokenPattern + authPatterns map[string]*tokenPattern + redactPatterns []RedactPattern } // NewHTTPAuthDetector creates a new HTTP authentication detector func NewHTTPAuthDetector() *HTTPAuthDetector { return &HTTPAuthDetector{ + // Redaction patterns: Bearer+JWT before Bearer+generic, URL auth + redactPatterns: []RedactPattern{ + { + Regex: regexp.MustCompile( + `Bearer\s+eyJ[A-Za-z0-9_-]{10,}\.eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}`), + Replacement: `Bearer [REDACTED-jwt]`, + Label: "REDACTED-jwt", + Prefixes: []string{"Bearer"}, + }, + { + Regex: regexp.MustCompile( + `Bearer\s+[A-Za-z0-9_.\-/+=]{20,}`), + Replacement: `Bearer [REDACTED-bearer-token]`, + Label: "REDACTED-bearer-token", + Prefixes: []string{"Bearer"}, + }, + { + Regex: regexp.MustCompile( + `Basic\s+[A-Za-z0-9+/=]{20,}`), + Replacement: `Basic [REDACTED-basic-auth]`, + Label: "REDACTED-basic-auth", + Prefixes: []string{"Basic"}, + }, + { + Regex: regexp.MustCompile(`(https?://)[^:"\s\\]+:[^@"\s\\]+(@)`), + Replacement: `${1}[REDACTED-basic-auth]${2}`, + Label: "REDACTED-basic-auth", + Prefixes: []string{"://"}, + }, + { + Regex: regexp.MustCompile( + `(?:X-API-Key|x-api-key|Authorization)[":\s]+[A-Za-z0-9_.\-/+=]{30,}`), + Replacement: `[REDACTED-api-key-header]`, + Label: "REDACTED-api-key-header", + Prefixes: 
[]string{"X-API-Key", "x-api-key", "Authorization"}, + }, + }, authPatterns: map[string]*tokenPattern{ "bearer-token": { // Matches: Authorization: Bearer @@ -73,6 +111,11 @@ func (d *HTTPAuthDetector) Detect(content string, ctx *models.DetectionContext) return findings } +// Redact replaces HTTP auth credentials in content with redaction markers. +func (d *HTTPAuthDetector) Redact(content string) (string, map[string]int) { + return ApplyRedactPatterns(content, d.redactPatterns) +} + // createFinding creates a finding for detected HTTP authentication credentials func (d *HTTPAuthDetector) createFinding(credential string, pattern *tokenPattern, ctx *models.DetectionContext) models.Finding { return models.Finding{ diff --git a/pkg/detector/jwt.go b/pkg/detector/jwt.go index b64ba09..882e542 100644 --- a/pkg/detector/jwt.go +++ b/pkg/detector/jwt.go @@ -12,12 +12,23 @@ import ( // JWTDetector detects JWT tokens in various contexts type JWTDetector struct { - tokenPatterns map[string]*tokenPattern + tokenPatterns map[string]*tokenPattern + redactPatterns []RedactPattern } // NewJWTDetector creates a new JWT detector func NewJWTDetector() *JWTDetector { return &JWTDetector{ + // Standalone JWT redaction (after Bearer patterns handled by HTTPAuthDetector) + redactPatterns: []RedactPattern{ + { + Regex: regexp.MustCompile( + `eyJ[A-Za-z0-9_-]{10,}\.eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}`), + Replacement: `[REDACTED-jwt]`, + Label: "REDACTED-jwt", + Prefixes: []string{"eyJ"}, + }, + }, tokenPatterns: map[string]*tokenPattern{ "jwt-token": { // Matches: .. @@ -63,6 +74,11 @@ func (d *JWTDetector) Detect(content string, ctx *models.DetectionContext) []mod return findings } +// Redact replaces standalone JWT tokens in content with redaction markers. 
+func (d *JWTDetector) Redact(content string) (string, map[string]int) {
+	return ApplyRedactPatterns(content, d.redactPatterns)
+}
+
 // createFinding creates a finding for detected JWT tokens
 func (d *JWTDetector) createFinding(credential string, pattern *tokenPattern, ctx *models.DetectionContext) models.Finding {
 	return models.Finding{
diff --git a/pkg/detector/npm_token.go b/pkg/detector/npm_token.go
index eb2adaf..f919da7 100644
--- a/pkg/detector/npm_token.go
+++ b/pkg/detector/npm_token.go
@@ -12,7 +12,8 @@ import (
 
 // NPMTokenDetector detects various NPM and Yarn authentication tokens
 type NPMTokenDetector struct {
-	tokenPatterns map[string]*tokenPattern
+	tokenPatterns  map[string]*tokenPattern
+	redactPatterns []RedactPattern
 }
 
 // NewNPMTokenDetector creates a new NPM token detector
@@ -25,6 +26,14 @@ func NewNPMTokenDetector() *NPMTokenDetector {
 				description: "NPM Authentication Token",
 			},
 		},
+		redactPatterns: []RedactPattern{
+			{
+				Regex:       regexp.MustCompile(`npm_[A-Za-z0-9]{36,}`),
+				Replacement: `[REDACTED-npm-token]`,
+				Label:       "REDACTED-npm-token",
+				Prefixes:    []string{"npm_"},
+			},
+		},
 	}
 }
 
@@ -60,6 +69,11 @@ func (d *NPMTokenDetector) Detect(content string, ctx *models.DetectionContext)
 	return findings
 }
 
+// Redact replaces NPM tokens in content with redaction markers.
+func (d *NPMTokenDetector) Redact(content string) (string, map[string]int) {
+	return ApplyRedactPatterns(content, d.redactPatterns)
+}
+
 // createFinding creates a finding for a detected NPM/Yarn token
 func (d *NPMTokenDetector) createFinding(token string, pattern *tokenPattern, ctx *models.DetectionContext) models.Finding {
 	return models.Finding{
diff --git a/pkg/detector/redact_test.go b/pkg/detector/redact_test.go
new file mode 100644
index 0000000..1129ab3
--- /dev/null
+++ b/pkg/detector/redact_test.go
@@ -0,0 +1,279 @@
+// Copyright (C) 2026 boostsecurity.io
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package detector
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestSSHPrivateKeyDetector_Redact(t *testing.T) {
+	t.Parallel()
+	d := NewSSHPrivateKeyDetector()
+
+	input := "-----BEGIN RSA PRIVATE KEY-----\nMIIBogIBAAJBALRiMLAH9+12345678901234567890ABCDEFGH=\n-----END RSA PRIVATE KEY-----"
+	out, counts := d.Redact(input)
+	assert.Equal(t, "[REDACTED-ssh-private-key]", out)
+	assert.Equal(t, 1, counts["REDACTED-ssh-private-key"])
+}
+
+func TestSSHPrivateKeyDetector_Redact_NoMatch(t *testing.T) {
+	t.Parallel()
+	d := NewSSHPrivateKeyDetector()
+
+	out, counts := d.Redact("no keys here")
+	assert.Equal(t, "no keys here", out)
+	assert.Empty(t, counts)
+}
+
+func TestGitHubTokenDetector_Redact(t *testing.T) {
+	t.Parallel()
+	d := NewGitHubPATDetector()
+
+	tests := []struct {
+		name  string
+		input string
+		want  string
+		label string
+	}{
+		{"classic pat", "ghp_" + repeat('A', 36), "[REDACTED-github-pat]", "REDACTED-github-pat"},
+		{"oauth", "gho_" + repeat('B', 36), "[REDACTED-github-oauth]", "REDACTED-github-oauth"},
+		{"user", "ghu_" + repeat('C', 36), "[REDACTED-github-user]", "REDACTED-github-user"},
+		{"app", "ghs_" + repeat('D', 36), "[REDACTED-github-app]", "REDACTED-github-app"},
+		{"fine-grained", "github_pat_" + repeat('E', 22), "[REDACTED-github-fine-pat]", "REDACTED-github-fine-pat"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			out, counts := d.Redact(tt.input)
+			assert.Equal(t, tt.want, out)
+			assert.Equal(t, 1, counts[tt.label])
+		})
+	}
+}
+
+func TestNPMTokenDetector_Redact(t *testing.T) {
+	t.Parallel()
+	d := NewNPMTokenDetector()
+
+	out, counts := d.Redact("npm_" + repeat('F', 36))
+	assert.Equal(t, "[REDACTED-npm-token]", out)
+	assert.Equal(t, 1, counts["REDACTED-npm-token"])
+}
+
+func TestAIServiceDetector_Redact(t *testing.T) {
+	t.Parallel()
+	d := NewAIServiceDetector()
+
+	tests := []struct {
+		name  string
+		input string
+		want  string
+		label string
+	}{
+		{
+			"anthropic",
+			"sk-ant-api03-abcdefghij1234567890-ABCDE",
+			"[REDACTED-anthropic-key]",
+			"REDACTED-anthropic-key",
+		},
+		{
+			"openai sk-proj",
+			"sk-proj-abcdefghij1234567890-ABC",
+			"[REDACTED-openai-key]",
+			"REDACTED-openai-key",
+		},
+		{
+			"openai generic",
+			"sk-ABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890123456789",
+			"[REDACTED-openai-key]",
+			"REDACTED-openai-key",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			out, counts := d.Redact(tt.input)
+			assert.Equal(t, tt.want, out)
+			assert.Positive(t, counts[tt.label])
+		})
+	}
+}
+
+func TestAIServiceDetector_Redact_AnthropicBeforeGeneric(t *testing.T) {
+	t.Parallel()
+	d := NewAIServiceDetector()
+
+	// sk-ant- must not match the generic sk- pattern
+	input := "sk-ant-api03-abcdefghijklmnopqrstuvwxyz1234567890"
+	out, _ := d.Redact(input)
+	assert.Equal(t, "[REDACTED-anthropic-key]", out)
+}
+
+func TestHTTPAuthDetector_Redact(t *testing.T) {
+	t.Parallel()
+	d := NewHTTPAuthDetector()
+
+	tests := []struct {
+		name  string
+		input string
+		want  string
+		label string
+	}{
+		{
+			"bearer jwt",
+			"Bearer eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U",
+			"Bearer [REDACTED-jwt]",
+			"REDACTED-jwt",
+		},
+		{
+			"bearer generic",
+			"Bearer some-opaque-token-value-that-is-long-enough-here",
+			"Bearer [REDACTED-bearer-token]",
+			"REDACTED-bearer-token",
+		},
+		{
+			"basic auth",
+			"Basic YWRtaW46cGFzc3dvcmQxMjM0NTY3OA==",
+			"Basic [REDACTED-basic-auth]",
+			"REDACTED-basic-auth",
+		},
+		{
+			"url auth",
+			"https://admin:s3cretP4ss@example.com/api",
+			"https://[REDACTED-basic-auth]@example.com/api",
+			"REDACTED-basic-auth",
+		},
+		{
+			"api key header",
+			"X-API-Key: " + repeat('z', 40),
+			"[REDACTED-api-key-header]",
+			"REDACTED-api-key-header",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			out, counts := d.Redact(tt.input)
+			assert.Equal(t, tt.want, out)
+			assert.Positive(t, counts[tt.label])
+		})
+	}
+}
+
+func TestCloudCredentialsDetector_Redact(t *testing.T) {
+	t.Parallel()
+	d := NewCloudCredentialsDetector()
+
+	tests := []struct {
+		name  string
+		input string
+		want  string
+		label string
+	}{
+		{"aws akia", "AKIAIOSFODNN7EXAMPLE", "[REDACTED-aws-access-key]", "REDACTED-aws-access-key"},
+		{"aws asia", "ASIA1234567890ABCDEF", "[REDACTED-aws-sts-key]", "REDACTED-aws-sts-key"},
+		{"gcp", "AIzaSyA" + repeat('x', 32), "[REDACTED-gcp-api-key]", "REDACTED-gcp-api-key"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			out, counts := d.Redact(tt.input)
+			assert.Equal(t, tt.want, out)
+			assert.Equal(t, 1, counts[tt.label])
+		})
+	}
+}
+
+func TestCloudCredentialsDetector_Redact_AWSSessionToken(t *testing.T) {
+	t.Parallel()
+	d := NewCloudCredentialsDetector()
+
+	b64 := longBase64(120)
+	input := "aws_session_token = " + b64
+	out, counts := d.Redact(input)
+	assert.Equal(t, "aws_session_token = [REDACTED-aws-session-token]", out)
+	assert.Positive(t, counts["REDACTED-aws-session-token"])
+}
+
+func TestCloudCredentialsDetector_Redact_AWSSecretKey(t *testing.T) {
+	t.Parallel()
+	d := NewCloudCredentialsDetector()
+
+	input := "aws_secret_access_key = wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
+	out, counts := d.Redact(input)
+	assert.Equal(t, "aws_secret_access_key = [REDACTED-aws-secret-key]", out)
+	assert.Equal(t, 1, counts["REDACTED-aws-secret-key"])
+}
+
+func TestJWTDetector_Redact(t *testing.T) {
+	t.Parallel()
+	d := NewJWTDetector()
+
+	input := "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U"
+	out, counts := d.Redact(input)
+	assert.Equal(t, "[REDACTED-jwt]", out)
+	assert.Equal(t, 1, counts["REDACTED-jwt"])
+}
+
+func TestSplunkTokenDetector_Redact(t *testing.T) {
+	t.Parallel()
+	d := NewSplunkTokenDetector()
+
+	input := "splunkd_" + repeat('a', 32)
+	out, counts := d.Redact(input)
+	assert.Equal(t, "[REDACTED-splunk-session]", out)
+	assert.Equal(t, 1, counts["REDACTED-splunk-session"])
+}
+
+func TestGenericAPIKeyDetector_Redact_NoPatterns(t *testing.T) {
+	t.Parallel()
+	d := NewGenericAPIKeyDetector()
+
+	// GenericAPIKeyDetector has no redaction patterns (header redaction is
+	// in HTTPAuthDetector), so Redact should be a no-op.
+	input := "some random content"
+	out, counts := d.Redact(input)
+	assert.Equal(t, input, out)
+	assert.Empty(t, counts)
+}
+
+func TestRedactAll_RegistryOrdering(t *testing.T) {
+	t.Parallel()
+
+	registry := NewRegistry()
+	registry.Register(NewHTTPAuthDetector())
+	registry.Register(NewJWTDetector())
+
+	// Bearer+JWT should be handled by HTTPAuth, standalone JWT should not
+	// double-match the already-redacted content
+	input := "Bearer eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U"
+	out, counts := registry.RedactAll(input)
+	assert.Equal(t, "Bearer [REDACTED-jwt]", out)
+	assert.Equal(t, 1, counts["REDACTED-jwt"])
+}
+
+// -- helpers --
+
+func repeat(c byte, n int) string {
+	b := make([]byte, n)
+	for i := range b {
+		b[i] = c
+	}
+	return string(b)
+}
+
+func longBase64(n int) string {
+	chars := "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+	b := make([]byte, n)
+	for i := range b {
+		b[i] = chars[i%len(chars)]
+	}
+	return string(b)
+}
diff --git a/pkg/detector/splunk_token.go b/pkg/detector/splunk_token.go
new file mode 100644
index 0000000..9dbf076
--- /dev/null
+++ b/pkg/detector/splunk_token.go
@@ -0,0 +1,72 @@
+// Copyright (C) 2026 boostsecurity.io
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package detector
+
+import (
+	"fmt"
+	"regexp"
+
+	"github.com/boostsecurityio/bagel/pkg/models"
+)
+
+// SplunkTokenDetector detects Splunk session tokens
+type SplunkTokenDetector struct {
+	tokenPattern   *regexp.Regexp
+	redactPatterns []RedactPattern
+}
+
+// NewSplunkTokenDetector creates a new Splunk session token detector
+func NewSplunkTokenDetector() *SplunkTokenDetector {
+	pattern := regexp.MustCompile(`\b(splunkd_[A-Za-z0-9]{32,})\b`)
+	return &SplunkTokenDetector{
+		tokenPattern: pattern,
+		redactPatterns: []RedactPattern{
+			{
+				Regex:       pattern,
+				Replacement: `[REDACTED-splunk-session]`,
+				Label:       "REDACTED-splunk-session",
+				Prefixes:    []string{"splunkd_"},
+			},
+		},
+	}
+}
+
+// Name returns the detector name
+func (d *SplunkTokenDetector) Name() string {
+	return "splunk-token"
+}
+
+// Detect scans content for Splunk session tokens and returns findings
+func (d *SplunkTokenDetector) Detect(
+	content string,
+	ctx *models.DetectionContext,
+) []models.Finding {
+	matches := d.tokenPattern.FindAllString(content, -1)
+	findings := make([]models.Finding, 0, len(matches))
+	for _, match := range matches {
+		findings = append(findings, models.Finding{
+			ID:       "splunk-session-token",
+			Severity: "critical",
+			Title:    "Splunk Session Token Detected",
+			Message: fmt.Sprintf(
+				"A Splunk session token was detected in %s. "+
+					"This credential provides authenticated access to Splunk. "+
+					"Revoke the session and rotate credentials.",
+				ctx.FormatSource(),
+			),
+			Path: ctx.Source,
+			Metadata: map[string]interface{}{
+				"detector_name": d.Name(),
+				"token_type":    "splunk-session-token",
+				"fingerprint":   Fingerprint(match),
+			},
+		})
+	}
+	return findings
+}
+
+// Redact replaces Splunk session tokens in content with redaction markers.
+func (d *SplunkTokenDetector) Redact(content string) (string, map[string]int) {
+	return ApplyRedactPatterns(content, d.redactPatterns)
+}
diff --git a/pkg/detector/ssh_private_key.go b/pkg/detector/ssh_private_key.go
index ac22901..0a5019a 100644
--- a/pkg/detector/ssh_private_key.go
+++ b/pkg/detector/ssh_private_key.go
@@ -13,7 +13,8 @@ import (
 
 // SSHPrivateKeyDetector detects SSH private keys in content
 type SSHPrivateKeyDetector struct {
-	keyPattern *regexp.Regexp
+	keyPattern     *regexp.Regexp
+	redactPatterns []RedactPattern
 }
 
 // NewSSHPrivateKeyDetector creates a new SSH private key detector
@@ -24,6 +25,17 @@ func NewSSHPrivateKeyDetector() *SSHPrivateKeyDetector {
 
 	return &SSHPrivateKeyDetector{
 		keyPattern: pattern,
+		redactPatterns: []RedactPattern{
+			{
+				Regex: regexp.MustCompile(
+					`-----BEGIN\s+(?:RSA\s+|EC\s+|OPENSSH\s+|DSA\s+)?PRIVATE KEY-----` +
+						`[A-Za-z0-9+/=\s\\n]{20,}` +
+						`-----END\s+(?:RSA\s+|EC\s+|OPENSSH\s+|DSA\s+)?PRIVATE KEY-----`),
+				Replacement: `[REDACTED-ssh-private-key]`,
+				Label:       "REDACTED-ssh-private-key",
+				Prefixes:    []string{"-----BEGIN"},
+			},
+		},
 	}
 }
 
@@ -124,6 +136,11 @@ func (d *SSHPrivateKeyDetector) isEncrypted(keyContent string) bool {
 	return false
 }
 
+// Redact replaces SSH private keys in content with redaction markers.
+func (d *SSHPrivateKeyDetector) Redact(content string) (string, map[string]int) {
+	return ApplyRedactPatterns(content, d.redactPatterns)
+}
+
 // createFinding creates a finding for a detected SSH private key
 func (d *SSHPrivateKeyDetector) createFinding(keyContent, keyType string, isEncrypted bool, ctx *models.DetectionContext) models.Finding {
 	var severity string
diff --git a/pkg/probe/ai_cli.go b/pkg/probe/ai_cli.go
index b005c20..9ae4667 100644
--- a/pkg/probe/ai_cli.go
+++ b/pkg/probe/ai_cli.go
@@ -121,7 +121,7 @@ func (p *AICliProbe) Execute(ctx context.Context) ([]models.Finding, error) {
 
 // processFile reads and analyzes an AI CLI adjacent file
 func (p *AICliProbe) processFile(ctx context.Context, filePath string) []models.Finding {
-	var findings []models.Finding
+	findings := make([]models.Finding, 0, 4)
 
 	// Read file
 	content, err := os.ReadFile(filePath)
diff --git a/pkg/probe/cloud.go b/pkg/probe/cloud.go
index d2b86f9..44831bf 100644
--- a/pkg/probe/cloud.go
+++ b/pkg/probe/cloud.go
@@ -103,7 +103,7 @@ func (p *CloudProbe) Execute(ctx context.Context) ([]models.Finding, error) {
 
 // processCloudFile reads and analyzes a cloud credential file
 func (p *CloudProbe) processCloudFile(ctx context.Context, filePath string) []models.Finding {
-	var findings []models.Finding
+	findings := make([]models.Finding, 0, 4)
 
 	// Read file
 	content, err := os.ReadFile(filePath)
diff --git a/pkg/probe/env.go b/pkg/probe/env.go
index 37324d9..60d837d 100644
--- a/pkg/probe/env.go
+++ b/pkg/probe/env.go
@@ -129,7 +129,7 @@ func (p *EnvProbe) scanShellConfigFiles(ctx context.Context) []models.Finding {
 
 // processShellConfigFile reads and analyzes a shell configuration file
 func (p *EnvProbe) processShellConfigFile(ctx context.Context, filePath string) []models.Finding {
-	var findings []models.Finding
+	findings := make([]models.Finding, 0, 4)
 
 	// Read file contents
 	content, err := os.ReadFile(filePath)
@@ -156,10 +156,9 @@ func (p *EnvProbe) processShellConfigFile(ctx context.Context, filePath string)
 
 // scanEnvFiles scans .env files for secrets and configuration issues
 func (p *EnvProbe) scanEnvFiles(ctx context.Context) []models.Finding {
-	var findings []models.Finding
-
 	// Get .env files from file index
 	envFiles := p.fileIndex.Get("env_files")
+	findings := make([]models.Finding, 0, len(envFiles))
 
 	log.Ctx(ctx).Debug().
 		Int("count", len(envFiles)).
@@ -175,7 +174,7 @@ func (p *EnvProbe) scanEnvFiles(ctx context.Context) []models.Finding {
 
 // processEnvFile reads and analyzes a .env file
 func (p *EnvProbe) processEnvFile(ctx context.Context, filePath string) []models.Finding {
-	var findings []models.Finding
+	findings := make([]models.Finding, 0, 4)
 
 	// Read file contents
 	content, err := os.ReadFile(filePath)
diff --git a/pkg/probe/gh.go b/pkg/probe/gh.go
index 7a3b329..aef0b60 100644
--- a/pkg/probe/gh.go
+++ b/pkg/probe/gh.go
@@ -37,7 +37,7 @@ func (p *GHProbe) IsEnabled() bool {
 
 // Execute runs the GitHub CLI probe
 func (p *GHProbe) Execute(ctx context.Context) ([]models.Finding, error) {
-	var findings []models.Finding
+	findings := make([]models.Finding, 0, 1)
 
 	// Check if gh CLI is installed
 	ghPath, err := exec.LookPath("gh")
diff --git a/pkg/probe/git.go b/pkg/probe/git.go
index 18dee9a..e2d0441 100644
--- a/pkg/probe/git.go
+++ b/pkg/probe/git.go
@@ -42,7 +42,7 @@ func (p *GitProbe) IsEnabled() bool {
 
 // Execute runs the Git probe
 func (p *GitProbe) Execute(ctx context.Context) ([]models.Finding, error) {
-	var findings []models.Finding
+	findings := make([]models.Finding, 0, 4)
 
 	// Get all global git config once
 	cmd := exec.CommandContext(ctx, "git", "config", "--list", "--global")
@@ -65,7 +65,7 @@ func (p *GitProbe) Execute(ctx context.Context) ([]models.Finding, error) {
 
 // checkGitConfig checks for insecure git configuration settings
 func (p *GitProbe) checkGitConfig(config map[string]string) []models.Finding {
-	var findings []models.Finding
+	findings := make([]models.Finding, 0, 4)
 
 	// Check for SSL verification disabled
 	findings = append(findings, p.checkSSLVerify(config)...)
@@ -324,7 +324,7 @@ func (p *GitProbe) checkHooksPath(config map[string]string) []models.Finding {
 
 // scanGitConfigForSecrets scans git config values for embedded secrets
 func (p *GitProbe) scanGitConfigForSecrets(config map[string]string) []models.Finding {
-	var findings []models.Finding
+	findings := make([]models.Finding, 0, len(config))
 
 	for configKey, configValue := range config {
 		// Run detectors on config values
diff --git a/pkg/probe/npm.go b/pkg/probe/npm.go
index 879afb0..02aad9d 100644
--- a/pkg/probe/npm.go
+++ b/pkg/probe/npm.go
@@ -80,7 +80,7 @@ func (p *NPMProbe) Execute(ctx context.Context) ([]models.Finding, error) {
 
 // processConfigFile reads and analyzes a single NPM/Yarn config file
 func (p *NPMProbe) processConfigFile(ctx context.Context, filePath string) []models.Finding {
-	var findings []models.Finding
+	findings := make([]models.Finding, 0, 4)
 
 	// Read file contents
 	content, err := os.ReadFile(filePath)
@@ -153,7 +153,7 @@ func parseNPMConfig(content string) map[string]string {
 
 // checkNPMConfig checks for insecure NPM/Yarn configuration settings
 func (p *NPMProbe) checkNPMConfig(filePath string, config map[string]string) []models.Finding {
-	var findings []models.Finding
+	findings := make([]models.Finding, 0, 4)
 
 	// Check for SSL verification disabled
 	findings = append(findings, p.checkStrictSSL(filePath, config)...)
diff --git a/pkg/probe/ssh.go b/pkg/probe/ssh.go
index 756db0a..e0c051e 100644
--- a/pkg/probe/ssh.go
+++ b/pkg/probe/ssh.go
@@ -87,7 +87,7 @@ func (p *SSHProbe) Execute(ctx context.Context) ([]models.Finding, error) {
 
 // processSSHConfig reads and analyzes an SSH config file
 func (p *SSHProbe) processSSHConfig(ctx context.Context, configPath string) []models.Finding {
-	var findings []models.Finding
+	findings := make([]models.Finding, 0, 4)
 
 	// Read config file
 	content, err := os.ReadFile(configPath)
diff --git a/pkg/scrubber/scrubber.go b/pkg/scrubber/scrubber.go
new file mode 100644
index 0000000..48694d7
--- /dev/null
+++ b/pkg/scrubber/scrubber.go
@@ -0,0 +1,235 @@
+// Copyright (C) 2026 boostsecurity.io
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package scrubber
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"runtime"
+	"sort"
+	"strings"
+
+	"github.com/boostsecurityio/bagel/pkg/detector"
+	"github.com/rs/zerolog"
+	"golang.org/x/sync/errgroup"
+)
+
+// scrubFile reads a file, applies all registry redactions, and writes back.
+// Returns whether the file was modified and counts by label.
+func scrubFile(path string, registry *detector.Registry) (bool, map[string]int, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return false, nil, fmt.Errorf("read %s: %w", path, err)
+	}
+
+	content := string(data)
+	scrubbed, counts := registry.RedactAll(content)
+
+	if scrubbed == content {
+		return false, nil, nil
+	}
+
+	info, err := os.Stat(path)
+	if err != nil {
+		return false, nil, fmt.Errorf("stat %s: %w", path, err)
+	}
+
+	if err := os.WriteFile(path, []byte(scrubbed), info.Mode()); err != nil {
+		return false, nil, fmt.Errorf("write %s: %w", path, err)
+	}
+
+	return true, counts, nil
+}
+
+// PreviewInput configures a read-only scrub preview.
+type PreviewInput struct {
+	Files    []string
+	Registry *detector.Registry
+}
+
+// PreviewResult holds the outcome of previewing files for redactable content.
+// Files lists the paths that contain redactable content.
+type PreviewResult struct {
+	FilesScanned int
+	Files        []string
+	Redactions   int
+	CountsByType map[string]int
+}
+
+// ApplyInput configures a scrub apply operation.
+type ApplyInput struct {
+	Files    []string
+	Registry *detector.Registry
+}
+
+// ApplyResult holds the outcome of applying redactions.
+type ApplyResult struct {
+	FilesModified int
+	Redactions    int
+	CountsByType  map[string]int
+}
+
+// fileResult holds the outcome of processing a single file.
+type fileResult struct {
+	changed bool
+	counts  map[string]int
+}
+
+// Preview reads files and counts what would be redacted without writing.
+func Preview(ctx context.Context, input PreviewInput) (PreviewResult, error) {
+	log := zerolog.Ctx(ctx)
+	result := PreviewResult{CountsByType: make(map[string]int)}
+
+	result.FilesScanned = len(input.Files)
+	if len(input.Files) == 0 {
+		log.Info().Msg("No files to preview")
+		return result, nil
+	}
+
+	log.Debug().Int("file_count", len(input.Files)).Msg("Previewing files")
+
+	processor := func(path string) (fileResult, error) {
+		data, err := os.ReadFile(path)
+		if err != nil {
+			return fileResult{}, fmt.Errorf("read %s: %w", path, err)
+		}
+		_, counts := input.Registry.RedactAll(string(data))
+		if len(counts) > 0 {
+			return fileResult{changed: true, counts: counts}, nil
+		}
+		return fileResult{}, nil
+	}
+
+	results, err := processFilesConcurrently(ctx, input.Files, processor)
+	if err != nil {
+		return result, err
+	}
+
+	for i, fr := range results {
+		if !fr.changed {
+			continue
+		}
+		result.Files = append(result.Files, input.Files[i])
+		mergeCounts(result.CountsByType, fr.counts)
+		result.Redactions += sumCounts(fr.counts)
+	}
+
+	return result, nil
+}
+
+// Apply scrubs credential patterns from the given files, writing
+// changes back to disk. Call Preview first to discover which files
+// need scrubbing.
+func Apply(ctx context.Context, input ApplyInput) (ApplyResult, error) {
+	log := zerolog.Ctx(ctx)
+	result := ApplyResult{CountsByType: make(map[string]int)}
+
+	if len(input.Files) == 0 {
+		return result, nil
+	}
+
+	processor := func(path string) (fileResult, error) {
+		changed, counts, err := scrubFile(path, input.Registry)
+		if err != nil {
+			return fileResult{}, err
+		}
+		return fileResult{changed: changed, counts: counts}, nil
+	}
+
+	results, err := processFilesConcurrently(ctx, input.Files, processor)
+	if err != nil {
+		return result, err
+	}
+
+	for i, fr := range results {
+		if !fr.changed {
+			continue
+		}
+		result.FilesModified++
+		mergeCounts(result.CountsByType, fr.counts)
+		result.Redactions += sumCounts(fr.counts)
+
+		log.Debug().
+			Str("file", filepath.Base(input.Files[i])).
+			Str("types", formatCounts(fr.counts)).
+			Msg("Scrubbed")
+	}
+
+	return result, nil
+}
+
+// fileProcessor is a function that processes a single file and
+// returns whether it had redactable content and the counts by type.
+type fileProcessor func(path string) (fileResult, error)
+
+// processFilesConcurrently runs process over files with at most
+// GOMAXPROCS workers and returns one result per input file, in
+// input order. A file that fails to process is logged and left as
+// the zero fileResult so one unreadable file does not abort the
+// batch. If ctx is cancelled, the remaining files are skipped and
+// the cancellation error is returned instead of partial results.
+func processFilesConcurrently(
+	ctx context.Context,
+	files []string,
+	process fileProcessor,
+) ([]fileResult, error) {
+	log := zerolog.Ctx(ctx)
+
+	results := make([]fileResult, len(files))
+	workers := runtime.GOMAXPROCS(0)
+	g, ctx := errgroup.WithContext(ctx)
+	g.SetLimit(workers)
+
+	for i, path := range files {
+		g.Go(func() error {
+			// Surface cancellation: returning nil here would make
+			// a cancelled run look like a complete, successful one.
+			if err := ctx.Err(); err != nil {
+				return err
+			}
+			fr, err := process(path)
+			if err != nil {
+				log.Warn().Err(err).Str("file", path).Msg("Failed to process file")
+				return nil
+			}
+			results[i] = fr
+			return nil
+		})
+	}
+
+	if err := g.Wait(); err != nil {
+		return nil, fmt.Errorf("process files: %w", err)
+	}
+	return results, nil
+}
+
+// mergeCounts adds every count in src to the matching key in dst.
+func mergeCounts(dst, src map[string]int) {
+	for k, v := range src {
+		dst[k] += v
+	}
+}
+
+// sumCounts returns the total of all values in counts.
+func sumCounts(counts map[string]int) int {
+	total := 0
+	for _, v := range counts {
+		total += v
+	}
+	return total
+}
+
+// formatCounts renders counts as a comma-separated "label:count" list.
+// Entries are sorted so log output is stable across runs; Go map
+// iteration order is otherwise randomized.
+func formatCounts(counts map[string]int) string {
+	parts := make([]string, 0, len(counts))
+	for k, v := range counts {
+		parts = append(parts, fmt.Sprintf("%s:%d", k, v))
+	}
+	sort.Strings(parts)
+	return strings.Join(parts, ", ")
+}
diff --git a/pkg/scrubber/scrubber_test.go b/pkg/scrubber/scrubber_test.go
new file mode 100644
index 0000000..ef2ccf4
--- /dev/null
+++ b/pkg/scrubber/scrubber_test.go
@@ -0,0 +1,474 @@
+// Copyright (C) 2026 boostsecurity.io
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package scrubber
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"runtime"
+	"testing"
+
+	"github.com/boostsecurityio/bagel/pkg/detector"
+	"github.com/rs/zerolog"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// newTestRegistry builds the same registry used by the scrub command.
+func newTestRegistry() *detector.Registry {
+	registry := detector.NewRegistry()
+	registry.Register(detector.NewSSHPrivateKeyDetector())
+	registry.Register(detector.NewHTTPAuthDetector())
+	registry.Register(detector.NewAIServiceDetector())
+	registry.Register(detector.NewCloudCredentialsDetector())
+	registry.Register(detector.NewSplunkTokenDetector())
+	registry.Register(detector.NewGitHubPATDetector())
+	registry.Register(detector.NewNPMTokenDetector())
+	registry.Register(detector.NewJWTDetector())
+	registry.Register(detector.NewGenericAPIKeyDetector())
+	return registry
+}
+
+func TestRedactAll_Patterns(t *testing.T) {
+	t.Parallel()
+	registry := newTestRegistry()
+
+	tests := []struct {
+		name      string
+		input     string
+		wantLabel string
+		wantOut   string
+	}{
+		// 1. SSH private key
+		{
+			name:      "ssh private key RSA",
+			input:     "-----BEGIN RSA PRIVATE KEY-----\nMIIBogIBAAJBALRiMLAH9+12345678901234567890ABCDEFGH=\n-----END RSA PRIVATE KEY-----",
+			wantLabel: "REDACTED-ssh-private-key",
+			wantOut:   "[REDACTED-ssh-private-key]",
+		},
+		{
+			name:      "ssh private key OPENSSH",
+			input:     "-----BEGIN OPENSSH PRIVATE KEY-----\nb3BlbnNzaC1rZXktdjEAAAAA+/==\n-----END OPENSSH PRIVATE KEY-----",
+			wantLabel: "REDACTED-ssh-private-key",
+			wantOut:   "[REDACTED-ssh-private-key]",
+		},
+		// 2. Bearer + JWT
+		{
+			name:      "bearer jwt",
+			input:     "Bearer eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U",
+			wantLabel: "REDACTED-jwt",
+			wantOut:   "Bearer [REDACTED-jwt]",
+		},
+		// 3. Bearer + non-JWT token
+		{
+			name:      "bearer generic token",
+			input:     "Bearer some-opaque-token-value-that-is-long-enough-here",
+			wantLabel: "REDACTED-bearer-token",
+			wantOut:   "Bearer [REDACTED-bearer-token]",
+		},
+		// 4. Basic auth header
+		{
+			name:      "basic auth header",
+			input:     "Basic YWRtaW46cGFzc3dvcmQxMjM0NTY3OA==",
+			wantLabel: "REDACTED-basic-auth",
+			wantOut:   "Basic [REDACTED-basic-auth]",
+		},
+		// 5. Anthropic API key
+		{
+			name:      "anthropic key",
+			input:     "sk-ant-api03-abcdefghij1234567890-ABCDE",
+			wantLabel: "REDACTED-anthropic-key",
+			wantOut:   "[REDACTED-anthropic-key]",
+		},
+		// 6. OpenAI API key (new format)
+		{
+			name:      "openai key sk-proj",
+			input:     "sk-proj-abcdefghij1234567890-ABC",
+			wantLabel: "REDACTED-openai-key",
+			wantOut:   "[REDACTED-openai-key]",
+		},
+		// 7. Generic OpenAI key (older)
+		{
+			name:      "openai key generic sk-",
+			input:     "sk-ABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890123456789",
+			wantLabel: "REDACTED-openai-key",
+			wantOut:   "[REDACTED-openai-key]",
+		},
+		// 8. AWS access key (AKIA)
+		{
+			name:      "aws access key AKIA",
+			input:     "AKIAIOSFODNN7EXAMPLE",
+			wantLabel: "REDACTED-aws-access-key",
+			wantOut:   "[REDACTED-aws-access-key]",
+		},
+		// 9. AWS STS key (ASIA)
+		{
+			name:      "aws sts key ASIA",
+			input:     "ASIA1234567890ABCDEF",
+			wantLabel: "REDACTED-aws-sts-key",
+			wantOut:   "[REDACTED-aws-sts-key]",
+		},
+		// 10. AWS session token (labeled)
+		{
+			name:      "aws session token labeled",
+			input:     `aws_session_token = ` + longBase64(120),
+			wantLabel: "REDACTED-aws-session-token",
+			wantOut:   `aws_session_token = [REDACTED-aws-session-token]`,
+		},
+		// 11. AWS STS session token (label-free)
+		{
+			name:      "aws sts session token prefix",
+			input:     "IQoJb3JpZ2lu" + longBase64(120),
+			wantLabel: "REDACTED-aws-session-token",
+			wantOut:   "[REDACTED-aws-session-token]",
+		},
+		// 12. AWS secret access key (labeled)
+		{
+			name:      "aws secret key labeled",
+			input:     `aws_secret_access_key = wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY`,
+			wantLabel: "REDACTED-aws-secret-key",
+			wantOut:   `aws_secret_access_key = [REDACTED-aws-secret-key]`,
+		},
+		// 13. Splunk session
+		{
+			name:      "splunk session token",
+			input:     "splunkd_" + repeatChar('a', 32),
+			wantLabel: "REDACTED-splunk-session",
+			wantOut:   "[REDACTED-splunk-session]",
+		},
+		// 14. GitHub PAT
+		{
+			name:      "github classic pat",
+			input:     "ghp_" + repeatChar('A', 36),
+			wantLabel: "REDACTED-github-pat",
+			wantOut:   "[REDACTED-github-pat]",
+		},
+		// 15. GitHub OAuth
+		{
+			name:      "github oauth token",
+			input:     "gho_" + repeatChar('B', 36),
+			wantLabel: "REDACTED-github-oauth",
+			wantOut:   "[REDACTED-github-oauth]",
+		},
+		// 16. GitHub user token
+		{
+			name:      "github user token",
+			input:     "ghu_" + repeatChar('C', 36),
+			wantLabel: "REDACTED-github-user",
+			wantOut:   "[REDACTED-github-user]",
+		},
+		// 17. GitHub app token
+		{
+			name:      "github app token",
+			input:     "ghs_" + repeatChar('D', 36),
+			wantLabel: "REDACTED-github-app",
+			wantOut:   "[REDACTED-github-app]",
+		},
+		// 18. GitHub fine-grained PAT
+		{
+			name:      "github fine-grained pat",
+			input:     "github_pat_" + repeatChar('E', 22),
+			wantLabel: "REDACTED-github-fine-pat",
+			wantOut:   "[REDACTED-github-fine-pat]",
+		},
+		// 19. NPM token
+		{
+			name:      "npm token",
+			input:     "npm_" + repeatChar('F', 36),
+			wantLabel: "REDACTED-npm-token",
+			wantOut:   "[REDACTED-npm-token]",
+		},
+		// 20. Basic auth in URLs
+		{
+			name:      "basic auth in url",
+			input:     "https://admin:s3cretP4ss@example.com/api",
+			wantLabel: "REDACTED-basic-auth",
+			wantOut:   "https://[REDACTED-basic-auth]@example.com/api",
+		},
+		// 21. Standalone JWT
+		{
+			name:      "standalone jwt",
+			input:     "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U",
+			wantLabel: "REDACTED-jwt",
+			wantOut:   "[REDACTED-jwt]",
+		},
+		// 22. Azure storage key
+		{
+			name:      "azure storage key",
+			input:     `AccountKey=` + longBase64(86) + "==",
+			wantLabel: "REDACTED-azure-storage-key",
+			wantOut:   `AccountKey=[REDACTED-azure-storage-key]`,
+		},
+		// 23. GCP API key
+		{
+			name:      "gcp api key",
+			input:     "AIzaSyA" + repeatChar('x', 32),
+			wantLabel: "REDACTED-gcp-api-key",
+			wantOut:   "[REDACTED-gcp-api-key]",
+		},
+		// 24. API key header
+		{
+			name:      "x-api-key header",
+			input:     `X-API-Key: ` + repeatChar('z', 40),
+			wantLabel: "REDACTED-api-key-header",
+			wantOut:   "[REDACTED-api-key-header]",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			out, counts := registry.RedactAll(tt.input)
+			assert.Equal(t, tt.wantOut, out, "scrubbed output mismatch")
+			assert.Positive(t, counts[tt.wantLabel], "expected label %s in counts", tt.wantLabel)
+		})
+	}
+}
+
+func TestRedactAll_NegativeCases(t *testing.T) {
+	t.Parallel()
+	registry := newTestRegistry()
+
+	tests := []struct {
+		name  string
+		input string
+	}{
+		{name: "short string", input: "hello world"},
+		{name: "sk- too short", input: "sk-abc123"},
+		{name: "AKIA too short", input: "AKIA123"},
+		{name: "ghp_ too short", input: "ghp_short"},
+		{name: "npm_ too short", input: "npm_short"},
+		{name: "not a jwt", input: "eyJnotvalid"},
+		{name: "normal url no creds", input: "https://example.com/path"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			out, counts := registry.RedactAll(tt.input)
+			assert.Equal(t, tt.input, out, "should not modify non-matching input")
+			assert.Empty(t, counts, "should have zero counts")
+		})
+	}
+}
+
+func TestRedactAll_OrderingBearerJWT(t *testing.T) {
+	t.Parallel()
+	registry := newTestRegistry()
+
+	// Bearer + JWT should match as "Bearer [REDACTED-jwt]", not as
+	// "Bearer [REDACTED-bearer-token]" followed by standalone JWT redaction.
+	input := "Bearer eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U"
+	out, counts := registry.RedactAll(input)
+
+	assert.Equal(t, "Bearer [REDACTED-jwt]", out)
+	assert.Equal(t, 1, counts["REDACTED-jwt"])
+	assert.Zero(t, counts["REDACTED-bearer-token"], "bearer-token should not match when JWT matches first")
+}
+
+func TestRedactAll_AnthropicBeforeGenericSK(t *testing.T) {
+	t.Parallel()
+	registry := newTestRegistry()
+
+	// sk-ant- should match as anthropic, not generic openai
+	input := "sk-ant-api03-abcdefghijklmnopqrstuvwxyz1234567890"
+	out, _ := registry.RedactAll(input)
+	assert.Equal(t, "[REDACTED-anthropic-key]", out)
+}
+
+func TestRedactAll_MultipleSecrets(t *testing.T) {
+	t.Parallel()
+	registry := newTestRegistry()
+
+	input := "key1=AKIAIOSFODNN7EXAMPLE key2=ghp_" + repeatChar('X', 36)
+	out, counts := registry.RedactAll(input)
+
+	assert.Contains(t, out, "[REDACTED-aws-access-key]")
+	assert.Contains(t, out, "[REDACTED-github-pat]")
+	assert.Equal(t, 1, counts["REDACTED-aws-access-key"])
+	assert.Equal(t, 1, counts["REDACTED-github-pat"])
+}
+
+func TestRedactAll_JSONEmbedded(t *testing.T) {
+	t.Parallel()
+	registry := newTestRegistry()
+
+	// Secrets often appear inside JSON strings in JSONL logs
+	input := `{"token":"ghp_` + repeatChar('Z', 36) + `","user":"alice"}`
+	out, counts := registry.RedactAll(input)
+
+	assert.Contains(t, out, "[REDACTED-github-pat]")
+	assert.Equal(t, 1, counts["REDACTED-github-pat"])
+	assert.Contains(t, out, `"user":"alice"`)
+}
+
+func TestScrubFile(t *testing.T) {
+	t.Parallel()
+	registry := newTestRegistry()
+
+	dir := t.TempDir()
+	path := filepath.Join(dir, "test.jsonl")
+
+	content := `{"key":"AKIAIOSFODNN7EXAMPLE","data":"safe"}`
+	require.NoError(t, os.WriteFile(path, []byte(content), 0600))
+
+	changed, counts, err := scrubFile(path, registry)
+	require.NoError(t, err)
+	assert.True(t, changed)
+	assert.Equal(t, 1, counts["REDACTED-aws-access-key"])
+
+	// Verify file was actually modified
+	data, err := os.ReadFile(path)
+	require.NoError(t, err)
+	assert.Contains(t, string(data), "[REDACTED-aws-access-key]")
+	assert.NotContains(t, string(data), "AKIAIOSFODNN7EXAMPLE")
+}
+
+func TestScrubFile_NoChanges(t *testing.T) {
+	t.Parallel()
+	registry := newTestRegistry()
+
+	dir := t.TempDir()
+	path := filepath.Join(dir, "clean.jsonl")
+
+	require.NoError(t, os.WriteFile(path, []byte(`{"safe":"data"}`), 0600))
+
+	changed, counts, err := scrubFile(path, registry)
+	require.NoError(t, err)
+	assert.False(t, changed)
+	assert.Nil(t, counts)
+}
+
+func TestScrubFile_PreservesPermissions(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("Unix file permissions not supported on Windows")
+	}
+	t.Parallel()
+	registry := newTestRegistry()
+
+	dir := t.TempDir()
+	path := filepath.Join(dir, "perms.jsonl")
+
+	require.NoError(t, os.WriteFile(path, []byte("AKIAIOSFODNN7EXAMPLE"), 0640))
+
+	_, _, err := scrubFile(path, registry)
+	require.NoError(t, err)
+
+	info, err := os.Stat(path)
+	require.NoError(t, err)
+	assert.Equal(t, os.FileMode(0640), info.Mode().Perm())
+}
+
+func TestPreview_FindsSecrets(t *testing.T) {
+	t.Parallel()
+	registry := newTestRegistry()
+
+	dir := t.TempDir()
+	path := filepath.Join(dir, "test.jsonl")
+	content := `{"key":"AKIAIOSFODNN7EXAMPLE"}`
+	require.NoError(t, os.WriteFile(path, []byte(content), 0600))
+
+	ctx := zerolog.Nop().WithContext(context.Background())
+	result, err := Preview(ctx, PreviewInput{
+		Files:    []string{path},
+		Registry: registry,
+	})
+	require.NoError(t, err)
+
+	assert.Equal(t, 1, result.FilesScanned)
+	assert.Equal(t, []string{path}, result.Files)
+	assert.Equal(t, 1, result.Redactions)
+
+	// Verify file was NOT modified (preview never writes)
+	data, err := os.ReadFile(path)
+	require.NoError(t, err)
+	assert.Equal(t, content, string(data))
+}
+
+func TestPreview_NoSecrets(t *testing.T) {
+	t.Parallel()
+	registry := newTestRegistry()
+
+	dir := t.TempDir()
+	path := filepath.Join(dir, "clean.jsonl")
+	require.NoError(t, os.WriteFile(path, []byte(`{"safe":"data"}`), 0600))
+
+	ctx := zerolog.Nop().WithContext(context.Background())
+	result, err := Preview(ctx, PreviewInput{
+		Files:    []string{path},
+		Registry: registry,
+	})
+	require.NoError(t, err)
+
+	assert.Equal(t, 1, result.FilesScanned)
+	assert.Empty(t, result.Files)
+	assert.Equal(t, 0, result.Redactions)
+}
+
+func TestPreview_EmptyFileList(t *testing.T) {
+	t.Parallel()
+
+	ctx := zerolog.Nop().WithContext(context.Background())
+	result, err := Preview(ctx, PreviewInput{
+		Registry: newTestRegistry(),
+	})
+	require.NoError(t, err)
+	assert.Equal(t, 0, result.FilesScanned)
+	assert.Empty(t, result.Files)
+}
+
+func TestApply_ScrubsFiles(t *testing.T) {
+	t.Parallel()
+	registry := newTestRegistry()
+
+	dir := t.TempDir()
+	path := filepath.Join(dir, "test.jsonl")
+	content := `{"key":"AKIAIOSFODNN7EXAMPLE"}`
+	require.NoError(t, os.WriteFile(path, []byte(content), 0600))
+
+	ctx := zerolog.Nop().WithContext(context.Background())
+	result, err := Apply(ctx, ApplyInput{
+		Files:    []string{path},
+		Registry: registry,
+	})
+	require.NoError(t, err)
+
+	assert.Equal(t, 1, result.FilesModified)
+	assert.Equal(t, 1, result.Redactions)
+
+	// Verify file WAS modified
+	data, err := os.ReadFile(path)
+	require.NoError(t, err)
+	assert.Contains(t, string(data), "[REDACTED-aws-access-key]")
+}
+
+func TestApply_EmptyFileList(t *testing.T) {
+	t.Parallel()
+
+	ctx := zerolog.Nop().WithContext(context.Background())
+	result, err := Apply(ctx, ApplyInput{Registry: newTestRegistry()})
+	require.NoError(t, err)
+	assert.Equal(t, 0, result.FilesModified)
+}
+
+// -- helpers --
+
+func repeatChar(c byte, n int) string {
+	b := make([]byte, n)
+	for i := range b {
+		b[i] = c
+	}
+	return string(b)
+}
+
+func longBase64(n int) string {
+	// Produces a string of valid base64 characters of length n
+	chars := "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+	b := make([]byte, n)
+	for i := range b {
+		b[i] = chars[i%len(chars)]
+	}
+	return string(b)
+}