From 7a1e73c11d686a198482abd5e8745b454c655d84 Mon Sep 17 00:00:00 2001 From: gfw-report Date: Sat, 20 Jul 2024 00:00:00 +0000 Subject: [PATCH] Support accent characters in cite names. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A new commit introduces a cite name that contains an accented character: @inproceedings{Müller2024a, It will cause the bibtex parser to fail with the following two errors: failed to extract cite name of: @inproceedings{Müller2024a, and parse failed at 55:17: syntax error: unexpected $end, expecting tCOMM The second error is an upstream limitation; I will try to get the fix merged upstream: https://github.com/nickng/bibtex. --- src/main.go | 5 +++-- src/vendor/github.com/nickng/bibtex/token.go | 12 +++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/main.go b/src/main.go index d35de8c..08ffd86 100644 --- a/src/main.go +++ b/src/main.go @@ -13,8 +13,9 @@ import ( "github.com/nickng/bibtex" ) -// Matches e.g.: @inproceedings{Doe2024a, -var re = regexp.MustCompile(`@[a-z]*\{([A-Za-z\-]*[0-9]{4}[a-z]),`) +// Matches e.g.: @inproceedings{Müller2024a, +// \p{L}\p{M} matches any letter, including accented characters. +var re = regexp.MustCompile(`@[a-z]*\{([\"\p{L}\p{M}\-]*[0-9]{4}[a-z]),`) // Map a cite name (e.g., Doe2024a) to its line number in the .bib file. All // cite names are unique.
diff --git a/src/vendor/github.com/nickng/bibtex/token.go b/src/vendor/github.com/nickng/bibtex/token.go index 17c451e..f26e32a 100644 --- a/src/vendor/github.com/nickng/bibtex/token.go +++ b/src/vendor/github.com/nickng/bibtex/token.go @@ -29,8 +29,18 @@ func isWhitespace(ch rune) bool { return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' } +func isAccent(ch rune) bool { + accents := "äöüßéêçñÁÉÍÓÚáéíóúàèìòùâêîôûãõñÄÖÜ" + for _, accent := range accents { + if ch == accent { + return true + } + } + return false +} + func isAlpha(ch rune) bool { - return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') + return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || isAccent(ch) } func isDigit(ch rune) bool {