diff --git a/Makefile b/Makefile index f246508f..8b37ef9c 100644 --- a/Makefile +++ b/Makefile @@ -73,6 +73,9 @@ test: $(SOURCES) github.com/junegunn/fzf/src/tui \ github.com/junegunn/fzf/src/util +bench: + cd src && SHELL=/bin/sh GOOS= $(GO) test -v -tags "$(TAGS)" -run=Bench -bench=. -benchmem + install: bin/fzf build: @@ -153,4 +156,4 @@ update: $(GO) get -u $(GO) mod tidy -.PHONY: all build release test install clean docker docker-test update +.PHONY: all build release test bench install clean docker docker-test update diff --git a/src/ansi.go b/src/ansi.go index b4f98f6d..a67ac0b9 100644 --- a/src/ansi.go +++ b/src/ansi.go @@ -1,8 +1,6 @@ package fzf import ( - "bytes" - "regexp" "strconv" "strings" "unicode/utf8" @@ -82,73 +80,154 @@ func toAnsiString(color tui.Color, offset int) string { return ret + ";" } -var ansiRegex *regexp.Regexp - -func init() { - /* - References: - - https://github.com/gnachman/iTerm2 - - http://ascii-table.com/ansi-escape-sequences.php - - http://ascii-table.com/ansi-escape-sequences-vt-100.php - - http://tldp.org/HOWTO/Bash-Prompt-HOWTO/x405.html - - https://invisible-island.net/xterm/ctlseqs/ctlseqs.html - */ - // The following regular expression will include not all but most of the - // frequently used ANSI sequences - ansiRegex = regexp.MustCompile("(?:\x1b[\\[()][0-9;]*[a-zA-Z@]|\x1b][0-9];[[:print:]]+(?:\x1b\\\\|\x07)|\x1b.|[\x0e\x0f]|.\x08)") +func isPrint(c uint8) bool { + return '\x20' <= c && c <= '\x7e' } -func findAnsiStart(str string) int { - idx := 0 - for ; idx < len(str); idx++ { - b := str[idx] - if b == 0x1b || b == 0x0e || b == 0x0f { - return idx +func matchOperatingSystemCommand(s string) int { + // `\x1b][0-9];[[:print:]]+(?:\x1b\\\\|\x07)` + // ^ match starting here + // + i := 5 // prefix matched in nextAnsiEscapeSequence() + for ; i < len(s) && isPrint(s[i]); i++ { + } + if i < len(s) { + if s[i] == '\x07' { + return i + 1 + } + if s[i] == '\x1b' && i < len(s)-1 && s[i+1] == '\\' { + return i + 2 } - if b == 0x08 && idx > 0 { - return idx - 1 + } + return -1 +} + +func matchControlSequence(s string) int { + // `\x1b[\\[()][0-9;]*[a-zA-Z@]` + // ^ match starting here + // + i := 2 // prefix matched in nextAnsiEscapeSequence() + for ; i < len(s) && (isNumeric(s[i]) || s[i] == ';'); i++ { + } + if i < len(s) { + c := s[i] + if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '@' { + return i + 1 } } - return idx + return -1 +} + +func isCtrlSeqStart(c uint8) bool { + return c == '\\' || c == '[' || c == '(' || c == ')' +} + +// nextAnsiEscapeSequence returns the ANSI escape sequence and is equivalent to +// calling FindStringIndex() on the below regex (which was originally used): +// +// "(?:\x1b[\\[()][0-9;]*[a-zA-Z@]|\x1b][0-9];[[:print:]]+(?:\x1b\\\\|\x07)|\x1b.|[\x0e\x0f]|.\x08)" +// +func nextAnsiEscapeSequence(s string) (int, int) { + // fast check for ANSI escape sequences + i := 0 + for ; i < len(s); i++ { + switch s[i] { + case '\x0e', '\x0f', '\x1b', '\x08': + // We ignore the fact that '\x08' cannot be the first char + // in the string and be an escape sequence for the sake of + // speed and simplicity. + goto Loop + } + } + return -1, -1 + +Loop: + for ; i < len(s); i++ { + switch s[i] { + case '\x08': + // backtrack to match: `.\x08` + if i > 0 && s[i-1] != '\n' { + if s[i-1] < utf8.RuneSelf { + return i - 1, i + 1 + } + _, n := utf8.DecodeLastRuneInString(s[:i]) + return i - n, i + 1 + } + case '\x1b': + // match: `\x1b[\\[()][0-9;]*[a-zA-Z@]` + if i+2 < len(s) && isCtrlSeqStart(s[i+1]) { + if j := matchControlSequence(s[i:]); j != -1 { + return i, i + j + } + } + + // match: `\x1b][0-9];[[:print:]]+(?:\x1b\\\\|\x07)` + if i+5 < len(s) && s[i+1] == ']' && isNumeric(s[i+2]) && + s[i+3] == ';' && isPrint(s[i+4]) { + + if j := matchOperatingSystemCommand(s[i:]); j != -1 { + return i, i + j + } + } + + // match: `\x1b.` + if i+1 < len(s) && s[i+1] != '\n' { + if s[i+1] < utf8.RuneSelf { + return i, i + 2 + } + _, n := utf8.DecodeRuneInString(s[i+1:]) + return i, i + n + 1 + } + case '\x0e', '\x0f': + // match: `[\x0e\x0f]` + return i, i + 1 + } + } + return -1, -1 } func extractColor(str string, state *ansiState, proc func(string, *ansiState) bool) (string, *[]ansiOffset, *ansiState) { - var offsets []ansiOffset - var output bytes.Buffer + // We append to a stack allocated variable that we'll + // later copy and return, to save on allocations. + offsets := make([]ansiOffset, 0, 32) if state != nil { offsets = append(offsets, ansiOffset{[2]int32{0, 0}, *state}) } - prevIdx := 0 - runeCount := 0 + var ( + pstate *ansiState // lazily allocated + output strings.Builder + prevIdx int + runeCount int + ) for idx := 0; idx < len(str); { - idx += findAnsiStart(str[idx:]) - if idx == len(str) { - break - } - // Make sure that we found an ANSI code - offset := ansiRegex.FindStringIndex(str[idx:]) - if len(offset) < 2 { - idx++ - continue + start, end := nextAnsiEscapeSequence(str[idx:]) + if start == -1 { + break } - offset[0] += idx - offset[1] += idx - idx = offset[1] + start += idx + idx += end // Check if we should continue - prev := str[prevIdx:offset[0]] + prev := str[prevIdx:start] if proc != nil && !proc(prev, state) { return "", nil, nil } + prevIdx = idx - prevIdx = offset[1] - runeCount += utf8.RuneCountInString(prev) - output.WriteString(prev) + if len(prev) != 0 { + runeCount += utf8.RuneCountInString(prev) + // Grow the buffer size to the maximum possible length (string length + // containing ansi codes) to avoid repetitive allocation + if output.Cap() == 0 { + output.Grow(len(str)) + } + output.WriteString(prev) + } - newState := interpretCode(str[offset[0]:offset[1]], state) + newState := interpretCode(str[start:idx], state) if !newState.equals(state) { if state != nil { // Update last offset @@ -157,8 +236,15 @@ func extractColor(str string, state *ansiState, proc func(string, *ansiState) bo if newState.colored() { // Append new offset - state = newState - offsets = append(offsets, ansiOffset{[2]int32{int32(runeCount), int32(runeCount)}, *state}) + if pstate == nil { + pstate = &ansiState{} + } + *pstate = newState + state = pstate + offsets = append(offsets, ansiOffset{ + [2]int32{int32(runeCount), int32(runeCount)}, + newState, + }) } else { // Discard state state = nil @@ -168,7 +254,6 @@ func extractColor(str string, state *ansiState, proc func(string, *ansiState) bo var rest string var trimmed string - if prevIdx == 0 { // No ANSI code found rest = str @@ -178,51 +263,75 @@ func extractColor(str string, state *ansiState, proc func(string, *ansiState) bo output.WriteString(rest) trimmed = output.String() } - if len(rest) > 0 && state != nil { - // Update last offset - runeCount += utf8.RuneCountInString(rest) - (&offsets[len(offsets)-1]).offset[1] = int32(runeCount) - } if proc != nil { proc(rest, state) } - if len(offsets) == 0 { - return trimmed, nil, state + if len(offsets) > 0 { + if len(rest) > 0 && state != nil { + // Update last offset + runeCount += utf8.RuneCountInString(rest) + (&offsets[len(offsets)-1]).offset[1] = int32(runeCount) + } + // Return a copy of the offsets slice + a := make([]ansiOffset, len(offsets)) + copy(a, offsets) + return trimmed, &a, state } - return trimmed, &offsets, state + return trimmed, nil, state +} + +func parseAnsiCode(s string) (int, string) { + var remaining string + if i := strings.IndexByte(s, ';'); i >= 0 { + remaining = s[i+1:] + s = s[:i] + } + + if len(s) > 0 { + // Inlined version of strconv.Atoi() that only handles positive + // integers and does not allocate on error. + code := 0 + for _, ch := range []byte(s) { + ch -= '0' + if ch > 9 { + return -1, remaining + } + code = code*10 + int(ch) + } + return code, remaining + } + + return -1, remaining } -func interpretCode(ansiCode string, prevState *ansiState) *ansiState { - // State - var state *ansiState +func interpretCode(ansiCode string, prevState *ansiState) ansiState { + var state ansiState if prevState == nil { - state = &ansiState{-1, -1, 0, -1} + state = ansiState{-1, -1, 0, -1} } else { - state = &ansiState{prevState.fg, prevState.bg, prevState.attr, prevState.lbg} + state = ansiState{prevState.fg, prevState.bg, prevState.attr, prevState.lbg} } if ansiCode[0] != '\x1b' || ansiCode[1] != '[' || ansiCode[len(ansiCode)-1] != 'm' { - if strings.HasSuffix(ansiCode, "0K") && prevState != nil { + if prevState != nil && strings.HasSuffix(ansiCode, "0K") { state.lbg = prevState.bg } return state } - ptr := &state.fg - state256 := 0 - - init := func() { + if len(ansiCode) <= 3 { state.fg = -1 state.bg = -1 state.attr = 0 - state256 = 0 + return state } - ansiCode = ansiCode[2 : len(ansiCode)-1] - if len(ansiCode) == 0 { - init() - } - for _, code := range strings.Split(ansiCode, ";") { - if num, err := strconv.Atoi(code); err == nil { + + state256 := 0 + ptr := &state.fg + + for len(ansiCode) != 0 { + var num int + if num, ansiCode = parseAnsiCode(ansiCode); num != -1 { switch state256 { case 0: switch num { @@ -253,7 +362,10 @@ func interpretCode(ansiCode string, prevState *ansiState) *ansiState { case 24: // tput rmul state.attr = state.attr &^ tui.Underline case 0: - init() + state.fg = -1 + state.bg = -1 + state.attr = 0 + state256 = 0 default: if num >= 30 && num <= 37 { state.fg = tui.Color(num - 30) @@ -289,6 +401,7 @@ func interpretCode(ansiCode string, prevState *ansiState) *ansiState { } } } + if state256 > 0 { *ptr = -1 } diff --git a/src/ansi_test.go b/src/ansi_test.go index 497eca44..36ed0c7d 100644 --- a/src/ansi_test.go +++ b/src/ansi_test.go @@ -2,12 +2,190 @@ package fzf import ( "fmt" + "math/rand" + "regexp" "strings" "testing" + "unicode/utf8" "github.com/junegunn/fzf/src/tui" ) +// The following regular expression will include not all but most of the +// frequently used ANSI sequences. This regex is used as a reference for +// testing nextAnsiEscapeSequence(). +// +// References: +// - https://github.com/gnachman/iTerm2 +// - http://ascii-table.com/ansi-escape-sequences.php +// - http://ascii-table.com/ansi-escape-sequences-vt-100.php +// - http://tldp.org/HOWTO/Bash-Prompt-HOWTO/x405.html +// - https://invisible-island.net/xterm/ctlseqs/ctlseqs.html +var ansiRegexRefence = regexp.MustCompile("(?:\x1b[\\[()][0-9;]*[a-zA-Z@]|\x1b][0-9];[[:print:]]+(?:\x1b\\\\|\x07)|\x1b.|[\x0e\x0f]|.\x08)") + +func testParserReference(t testing.TB, str string) { + t.Helper() + + toSlice := func(start, end int) []int { + if start == -1 { + return nil + } + return []int{start, end} + } + + s := str + for i := 0; ; i++ { + got := toSlice(nextAnsiEscapeSequence(s)) + exp := ansiRegexRefence.FindStringIndex(s) + + equal := len(got) == len(exp) + if equal { + for i := 0; i < len(got); i++ { + if got[i] != exp[i] { + equal = false + break + } + } + } + if !equal { + var exps, gots []rune + if len(got) == 2 { + gots = []rune(s[got[0]:got[1]]) + } + if len(exp) == 2 { + exps = []rune(s[exp[0]:exp[1]]) + } + t.Errorf("%d: %q: got: %v (%q) want: %v (%q)", i, s, got, gots, exp, exps) + return + } + if len(exp) == 0 { + return + } + s = s[exp[1]:] + } +} + +func TestNextAnsiEscapeSequence(t *testing.T) { + testStrs := []string{ + "\x1b[0mhello world", + "\x1b[1mhello world", + "椙\x1b[1m椙", + "椙\x1b[1椙m椙", + "\x1b[1mhello \x1b[mw\x1b7o\x1b8r\x1b(Bl\x1b[2@d", + "\x1b[1mhello \x1b[Kworld", + "hello \x1b[34;45;1mworld", + "hello \x1b[34;45;1mwor\x1b[34;45;1mld", + "hello \x1b[34;45;1mwor\x1b[0mld", + "hello \x1b[34;48;5;233;1mwo\x1b[38;5;161mr\x1b[0ml\x1b[38;5;161md", + "hello \x1b[38;5;38;48;5;48;1mwor\x1b[38;5;48;48;5;38ml\x1b[0md", + "hello \x1b[32;1mworld", + "hello world", + "hello \x1b[0;38;5;200;48;5;100mworld", + "\x1b椙", + "椙\x08", + "\n\x08", + "X\x08", + "", + "\x1b]4;3;rgb:aa/bb/cc\x07 ", + "\x1b]4;3;rgb:aa/bb/cc\x1b\\ ", + ansiBenchmarkString, + } + + for _, s := range testStrs { + testParserReference(t, s) + } +} + +func TestNextAnsiEscapeSequence_Fuzz_Modified(t *testing.T) { + t.Parallel() + if testing.Short() { + t.Skip("short test") + } + + testStrs := []string{ + "\x1b[0mhello world", + "\x1b[1mhello world", + "椙\x1b[1m椙", + "椙\x1b[1椙m椙", + "\x1b[1mhello \x1b[mw\x1b7o\x1b8r\x1b(Bl\x1b[2@d", + "\x1b[1mhello \x1b[Kworld", + "hello \x1b[34;45;1mworld", + "hello \x1b[34;45;1mwor\x1b[34;45;1mld", + "hello \x1b[34;45;1mwor\x1b[0mld", + "hello \x1b[34;48;5;233;1mwo\x1b[38;5;161mr\x1b[0ml\x1b[38;5;161md", + "hello \x1b[38;5;38;48;5;48;1mwor\x1b[38;5;48;48;5;38ml\x1b[0md", + "hello \x1b[32;1mworld", + "hello world", + "hello \x1b[0;38;5;200;48;5;100mworld", + ansiBenchmarkString, + } + + replacementBytes := [...]rune{'\x0e', '\x0f', '\x1b', '\x08'} + + modifyString := func(s string, rr *rand.Rand) string { + n := rr.Intn(len(s)) + b := []rune(s) + for ; n >= 0 && len(b) != 0; n-- { + i := rr.Intn(len(b)) + switch x := rr.Intn(4); x { + case 0: + b = append(b[:i], b[i+1:]...) + case 1: + j := rr.Intn(len(replacementBytes) - 1) + b[i] = replacementBytes[j] + case 2: + x := rune(rr.Intn(utf8.MaxRune)) + for !utf8.ValidRune(x) { + x = rune(rr.Intn(utf8.MaxRune)) + } + b[i] = x + case 3: + b[i] = rune(rr.Intn(utf8.MaxRune)) // potentially invalid + default: + t.Fatalf("unsupported value: %d", x) + } + } + return string(b) + } + + rr := rand.New(rand.NewSource(1)) + for _, s := range testStrs { + for i := 1_000; i >= 0; i-- { + testParserReference(t, modifyString(s, rr)) + } + } +} + +func TestNextAnsiEscapeSequence_Fuzz_Random(t *testing.T) { + t.Parallel() + + if testing.Short() { + t.Skip("short test") + } + + randomString := func(rr *rand.Rand) string { + numChars := rand.Intn(50) + codePoints := make([]rune, numChars) + for i := 0; i < len(codePoints); i++ { + var r rune + for n := 0; n < 1000; n++ { + r = rune(rr.Intn(utf8.MaxRune)) + // Allow 10% of runes to be invalid + if utf8.ValidRune(r) || rr.Float64() < 0.10 { + break + } + } + codePoints[i] = r + } + return string(codePoints) + } + + rr := rand.New(rand.NewSource(1)) + for i := 0; i < 100_000; i++ { + testParserReference(t, randomString(rr)) + } +} + func TestExtractColor(t *testing.T) { assert := func(offset ansiOffset, b int32, e int32, fg tui.Color, bg tui.Color, bold bool) { var attr tui.Attr @@ -185,3 +363,64 @@ func TestAnsiCodeStringConversion(t *testing.T) { &ansiState{attr: tui.Dim | tui.Italic, fg: 1, bg: 1}, "\x1b[2;3;7;38;2;10;20;30;48;5;100m") } + +func TestParseAnsiCode(t *testing.T) { + tests := []struct { + In, Exp string + N int + }{ + {"123", "", 123}, + {"1a", "", -1}, + {"1a;12", "12", -1}, + {"12;a", "a", 12}, + {"-2", "", -1}, + } + for _, x := range tests { + n, s := parseAnsiCode(x.In) + if n != x.N || s != x.Exp { + t.Fatalf("%q: got: (%d %q) want: (%d %q)", x.In, n, s, x.N, x.Exp) + } + } +} + +// kernel/bpf/preload/iterators/README +const ansiBenchmarkString = "\x1b[38;5;81m\x1b[01;31m\x1b[Kkernel/\x1b[0m\x1b[38;5;81mbpf/" + + "\x1b[0m\x1b[38;5;81mpreload/\x1b[0m\x1b[38;5;81miterators/" + + "\x1b[0m\x1b[38;5;149mMakefile\x1b[m\x1b[K\x1b[0m" + +func BenchmarkNextAnsiEscapeSequence(b *testing.B) { + b.SetBytes(int64(len(ansiBenchmarkString))) + for i := 0; i < b.N; i++ { + s := ansiBenchmarkString + for { + _, o := nextAnsiEscapeSequence(s) + if o == -1 { + break + } + s = s[o:] + } + } +} + +// Baseline test to compare the speed of nextAnsiEscapeSequence() to the +// previously used regex based implementation. +func BenchmarkNextAnsiEscapeSequence_Regex(b *testing.B) { + b.SetBytes(int64(len(ansiBenchmarkString))) + for i := 0; i < b.N; i++ { + s := ansiBenchmarkString + for { + a := ansiRegexRefence.FindStringIndex(s) + if len(a) == 0 { + break + } + s = s[a[1]:] + } + } +} + +func BenchmarkExtractColor(b *testing.B) { + b.SetBytes(int64(len(ansiBenchmarkString))) + for i := 0; i < b.N; i++ { + extractColor(ansiBenchmarkString, nil, nil) + } +}