first commit
Some checks failed
Backend Tests / Static Checks (push) Has been cancelled
Backend Tests / Tests (other) (push) Has been cancelled
Backend Tests / Tests (plugin) (push) Has been cancelled
Backend Tests / Tests (server) (push) Has been cancelled
Backend Tests / Tests (store) (push) Has been cancelled
Build Canary Image / build-frontend (push) Has been cancelled
Build Canary Image / build-push (linux/amd64) (push) Has been cancelled
Build Canary Image / build-push (linux/arm64) (push) Has been cancelled
Build Canary Image / merge (push) Has been cancelled
Frontend Tests / Lint (push) Has been cancelled
Frontend Tests / Build (push) Has been cancelled
Proto Linter / Lint Protos (push) Has been cancelled
Some checks failed
Backend Tests / Static Checks (push) Has been cancelled
Backend Tests / Tests (other) (push) Has been cancelled
Backend Tests / Tests (plugin) (push) Has been cancelled
Backend Tests / Tests (server) (push) Has been cancelled
Backend Tests / Tests (store) (push) Has been cancelled
Build Canary Image / build-frontend (push) Has been cancelled
Build Canary Image / build-push (linux/amd64) (push) Has been cancelled
Build Canary Image / build-push (linux/arm64) (push) Has been cancelled
Build Canary Image / merge (push) Has been cancelled
Frontend Tests / Lint (push) Has been cancelled
Frontend Tests / Build (push) Has been cancelled
Proto Linter / Lint Protos (push) Has been cancelled
This commit is contained in:
139
plugin/markdown/parser/tag.go
Normal file
139
plugin/markdown/parser/tag.go
Normal file
@@ -0,0 +1,139 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
gast "github.com/yuin/goldmark/ast"
|
||||
"github.com/yuin/goldmark/parser"
|
||||
"github.com/yuin/goldmark/text"
|
||||
|
||||
mast "github.com/usememos/memos/plugin/markdown/ast"
|
||||
)
|
||||
|
||||
// MaxTagLength is the upper bound on the length of a tag name, counted in
// runes (Unicode code points) rather than bytes.
const MaxTagLength = 100
|
||||
|
||||
// tagParser is the inline parser for #tag syntax. It has no fields and
// therefore carries no state between calls.
type tagParser struct{}
|
||||
|
||||
// NewTagParser creates a new inline parser for #tag syntax.
|
||||
func NewTagParser() parser.InlineParser {
|
||||
return &tagParser{}
|
||||
}
|
||||
|
||||
// Trigger returns the characters that trigger this parser.
|
||||
func (*tagParser) Trigger() []byte {
|
||||
return []byte{'#'}
|
||||
}
|
||||
|
||||
// isValidTagRune reports whether r may appear inside a tag name.
//
// A rune is accepted when it is one of:
//   - a Unicode letter (category L), in any script: Latin, CJK, Arabic,
//     Cyrillic, etc.;
//   - a Unicode number (category N). This is broader than decimal digits: it
//     also covers letter-like and other numerics such as vulgar fractions;
//   - a Unicode symbol (category S). This covers emoji (So), but also math,
//     currency and modifier symbols (Sm, Sc, Sk), so '+', '$' and '^' are
//     accepted as well;
//   - one of the ASCII structure characters: '_' (snake_case), '-'
//     (kebab-case), '/' (hierarchical tags such as category/subcategory) or
//     '&' (compound tags such as science&tech).
//
// Everything else (whitespace, punctuation, control characters, ...) is
// rejected and therefore terminates a tag.
func isValidTagRune(r rune) bool {
	// Explicitly allowed ASCII punctuation used to structure tag names.
	switch r {
	case '_', '-', '/', '&':
		return true
	}

	// Category-based checks give international character and emoji support.
	return unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsSymbol(r)
}
|
||||
|
||||
// Parse parses #tag syntax using Unicode-aware validation.
|
||||
// Tags support international characters and follow these rules:
|
||||
// - Must start with # followed by valid tag characters
|
||||
// - Valid characters: Unicode letters, Unicode digits, underscore (_), hyphen (-), forward slash (/)
|
||||
// - Maximum length: 100 runes (Unicode characters)
|
||||
// - Stops at: whitespace, punctuation, or other invalid characters
|
||||
func (*tagParser) Parse(_ gast.Node, block text.Reader, _ parser.Context) gast.Node {
|
||||
line, _ := block.PeekLine()
|
||||
|
||||
// Must start with #
|
||||
if len(line) == 0 || line[0] != '#' {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Check if it's a heading (## or space after #)
|
||||
if len(line) > 1 {
|
||||
if line[1] == '#' {
|
||||
// It's a heading (##), not a tag
|
||||
return nil
|
||||
}
|
||||
if line[1] == ' ' {
|
||||
// Space after # - heading or just a hash
|
||||
return nil
|
||||
}
|
||||
} else {
|
||||
// Just a lone #
|
||||
return nil
|
||||
}
|
||||
|
||||
// Parse tag using UTF-8 aware rune iteration
|
||||
tagStart := 1
|
||||
pos := tagStart
|
||||
runeCount := 0
|
||||
|
||||
for pos < len(line) {
|
||||
r, size := utf8.DecodeRune(line[pos:])
|
||||
|
||||
// Stop at invalid UTF-8
|
||||
if r == utf8.RuneError && size == 1 {
|
||||
break
|
||||
}
|
||||
|
||||
// Validate character using Unicode categories
|
||||
if !isValidTagRune(r) {
|
||||
break
|
||||
}
|
||||
|
||||
// Enforce max length (by rune count, not byte count)
|
||||
runeCount++
|
||||
if runeCount > MaxTagLength {
|
||||
break
|
||||
}
|
||||
|
||||
pos += size
|
||||
}
|
||||
|
||||
// Must have at least one character after #
|
||||
if pos <= tagStart {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Extract tag (without #)
|
||||
tagName := line[tagStart:pos]
|
||||
|
||||
// Make a copy of the tag name
|
||||
tagCopy := make([]byte, len(tagName))
|
||||
copy(tagCopy, tagName)
|
||||
|
||||
// Advance reader
|
||||
block.Advance(pos)
|
||||
|
||||
// Create node
|
||||
node := &mast.TagNode{
|
||||
Tag: tagCopy,
|
||||
}
|
||||
|
||||
return node
|
||||
}
|
||||
251
plugin/markdown/parser/tag_test.go
Normal file
251
plugin/markdown/parser/tag_test.go
Normal file
@@ -0,0 +1,251 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"github.com/yuin/goldmark/parser"
|
||||
"github.com/yuin/goldmark/text"
|
||||
|
||||
mast "github.com/usememos/memos/plugin/markdown/ast"
|
||||
)
|
||||
|
||||
func TestTagParser(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expectedTag string
|
||||
shouldParse bool
|
||||
}{
|
||||
{
|
||||
name: "basic tag",
|
||||
input: "#tag",
|
||||
expectedTag: "tag",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "tag with hyphen",
|
||||
input: "#work-notes",
|
||||
expectedTag: "work-notes",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "tag with ampersand",
|
||||
input: "#science&tech",
|
||||
expectedTag: "science&tech",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "tag with underscore",
|
||||
input: "#2024_plans",
|
||||
expectedTag: "2024_plans",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "numeric tag",
|
||||
input: "#123",
|
||||
expectedTag: "123",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "tag followed by space",
|
||||
input: "#tag ",
|
||||
expectedTag: "tag",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "tag followed by punctuation",
|
||||
input: "#tag.",
|
||||
expectedTag: "tag",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "tag in sentence",
|
||||
input: "#important task",
|
||||
expectedTag: "important",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "heading (##)",
|
||||
input: "## Heading",
|
||||
expectedTag: "",
|
||||
shouldParse: false,
|
||||
},
|
||||
{
|
||||
name: "space after hash",
|
||||
input: "# heading",
|
||||
expectedTag: "",
|
||||
shouldParse: false,
|
||||
},
|
||||
{
|
||||
name: "lone hash",
|
||||
input: "#",
|
||||
expectedTag: "",
|
||||
shouldParse: false,
|
||||
},
|
||||
{
|
||||
name: "hash with space",
|
||||
input: "# ",
|
||||
expectedTag: "",
|
||||
shouldParse: false,
|
||||
},
|
||||
{
|
||||
name: "special characters",
|
||||
input: "#tag@special",
|
||||
expectedTag: "tag",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "mixed case",
|
||||
input: "#WorkNotes",
|
||||
expectedTag: "WorkNotes",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "hierarchical tag with slash",
|
||||
input: "#tag1/subtag",
|
||||
expectedTag: "tag1/subtag",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "hierarchical tag with multiple levels",
|
||||
input: "#tag1/subtag/subtag2",
|
||||
expectedTag: "tag1/subtag/subtag2",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "hierarchical tag followed by space",
|
||||
input: "#work/notes ",
|
||||
expectedTag: "work/notes",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "hierarchical tag followed by punctuation",
|
||||
input: "#project/2024.",
|
||||
expectedTag: "project/2024",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "hierarchical tag with numbers and dashes",
|
||||
input: "#work-log/2024/q1",
|
||||
expectedTag: "work-log/2024/q1",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "Chinese characters",
|
||||
input: "#测试",
|
||||
expectedTag: "测试",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "Chinese tag followed by space",
|
||||
input: "#测试 some text",
|
||||
expectedTag: "测试",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "Chinese tag followed by punctuation",
|
||||
input: "#测试。",
|
||||
expectedTag: "测试",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "mixed Chinese and ASCII",
|
||||
input: "#测试test123",
|
||||
expectedTag: "测试test123",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "Japanese characters",
|
||||
input: "#テスト",
|
||||
expectedTag: "テスト",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "Korean characters",
|
||||
input: "#테스트",
|
||||
expectedTag: "테스트",
|
||||
shouldParse: true,
|
||||
},
|
||||
{
|
||||
name: "emoji",
|
||||
input: "#test🚀",
|
||||
expectedTag: "test🚀",
|
||||
shouldParse: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
p := NewTagParser()
|
||||
reader := text.NewReader([]byte(tt.input))
|
||||
ctx := parser.NewContext()
|
||||
|
||||
node := p.Parse(nil, reader, ctx)
|
||||
|
||||
if tt.shouldParse {
|
||||
require.NotNil(t, node, "Expected tag to be parsed")
|
||||
require.IsType(t, &mast.TagNode{}, node)
|
||||
|
||||
tagNode, ok := node.(*mast.TagNode)
|
||||
require.True(t, ok, "Expected node to be *mast.TagNode")
|
||||
assert.Equal(t, tt.expectedTag, string(tagNode.Tag))
|
||||
} else {
|
||||
assert.Nil(t, node, "Expected tag NOT to be parsed")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestTagParser_Trigger(t *testing.T) {
|
||||
p := NewTagParser()
|
||||
triggers := p.Trigger()
|
||||
|
||||
assert.Equal(t, []byte{'#'}, triggers)
|
||||
}
|
||||
|
||||
func TestTagParser_MultipleTags(t *testing.T) {
|
||||
// Test that parser correctly handles multiple tags in sequence
|
||||
input := "#tag1 #tag2"
|
||||
|
||||
p := NewTagParser()
|
||||
reader := text.NewReader([]byte(input))
|
||||
ctx := parser.NewContext()
|
||||
|
||||
// Parse first tag
|
||||
node1 := p.Parse(nil, reader, ctx)
|
||||
require.NotNil(t, node1)
|
||||
tagNode1, ok := node1.(*mast.TagNode)
|
||||
require.True(t, ok, "Expected node1 to be *mast.TagNode")
|
||||
assert.Equal(t, "tag1", string(tagNode1.Tag))
|
||||
|
||||
// Advance past the space
|
||||
reader.Advance(1)
|
||||
|
||||
// Parse second tag
|
||||
node2 := p.Parse(nil, reader, ctx)
|
||||
require.NotNil(t, node2)
|
||||
tagNode2, ok := node2.(*mast.TagNode)
|
||||
require.True(t, ok, "Expected node2 to be *mast.TagNode")
|
||||
assert.Equal(t, "tag2", string(tagNode2.Tag))
|
||||
}
|
||||
|
||||
func TestTagNode_Kind(t *testing.T) {
|
||||
node := &mast.TagNode{
|
||||
Tag: []byte("test"),
|
||||
}
|
||||
|
||||
assert.Equal(t, mast.KindTag, node.Kind())
|
||||
}
|
||||
|
||||
func TestTagNode_Dump(t *testing.T) {
|
||||
node := &mast.TagNode{
|
||||
Tag: []byte("test"),
|
||||
}
|
||||
|
||||
// Should not panic
|
||||
assert.NotPanics(t, func() {
|
||||
node.Dump([]byte("#test"), 0)
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user