Complete the document saving service

This commit is contained in:
2025-05-17 15:50:34 +08:00
parent bd0bbc9674
commit 1246166231
16 changed files with 1781 additions and 30 deletions

View File

@@ -0,0 +1,306 @@
package services
// EditType identifies the kind of a single edit operation in a diff.
type EditType int
const (
// EditInsert is an insertion. It is the zero value of EditType.
EditInsert EditType = iota
// EditDelete is a deletion.
EditDelete
// EditEqual is an unchanged (equal) part.
EditEqual
)
// Edit is a single edit operation: an operation kind plus the content it
// applies to.
type Edit struct {
Type EditType // Kind of operation (insert, delete, or equal)
Content string // Content the operation applies to
}
// DiffResult holds the outcome of a diff comparison: the edit list plus
// summary statistics derived from it.
type DiffResult struct {
Edits []Edit // Ordered list of edit operations
InsertCount int // Total length of inserted content (line separators excluded)
DeleteCount int // Total length of deleted content (line separators excluded)
ChangedLines int // Number of inserted lines plus number of deleted lines
TotalChanges int // InsertCount + DeleteCount
ChangedTokens int // Estimated number of changed tokens (e.g. words, identifiers)
}
// calculateChangesDetailed computes a line-level diff between oldText and
// newText and summarizes it.
//
// Both inputs are split into lines, diffed line by line, and the resulting
// edits are aggregated:
//   - InsertCount / DeleteCount: number of characters (Unicode code points) in
//     inserted / deleted lines; line separators are not counted.
//   - ChangedLines: one per inserted line and one per deleted line, so a
//     modified line (delete + insert) contributes two.
//   - TotalChanges: InsertCount + DeleteCount.
//   - ChangedTokens: word-based estimate from estimateChangedTokens.
//
// Unchanged lines are kept in Edits but do not affect any counter.
func calculateChangesDetailed(oldText, newText string) DiffResult {
	edits := computeLineEdits(splitLines(oldText), splitLines(newText))

	result := DiffResult{Edits: edits}
	for _, edit := range edits {
		switch edit.Type {
		case EditInsert:
			// Count code points, not bytes: len(string) would report 3 for a
			// single CJK character and inflate the "character" statistics.
			result.InsertCount += len([]rune(edit.Content))
			result.ChangedLines++
		case EditDelete:
			result.DeleteCount += len([]rune(edit.Content))
			result.ChangedLines++
		}
	}

	result.TotalChanges = result.InsertCount + result.DeleteCount
	result.ChangedTokens = estimateChangedTokens(edits)
	return result
}
// splitLines splits text into lines on '\n'.
//
// The separators themselves are dropped. A trailing newline does not produce
// an extra empty line ("a\n" and "a" both yield ["a"]), while empty lines in
// the middle or a lone "\n" are kept as "". Empty input yields a nil slice.
// A '\r' preceding '\n' is not stripped.
//
// Each returned line is a substring of text, so the original bytes are
// preserved exactly and no per-character allocation takes place.
func splitLines(text string) []string {
	var lines []string

	// Scanning bytes is safe: in UTF-8 the byte 0x0A only ever encodes '\n'.
	start := 0
	for i := 0; i < len(text); i++ {
		if text[i] == '\n' {
			lines = append(lines, text[start:i])
			start = i + 1
		}
	}

	// Keep the final line only when it is non-empty, i.e. when the text does
	// not end with a newline.
	if start < len(text) {
		lines = append(lines, text[start:])
	}
	return lines
}
// computeLineEdits diffs oldLines against newLines and returns the result as
// a list of single-line edits.
//
// The diff script comes from myersDiff. Each script step becomes one Edit
// whose Content is the affected line: taken from newLines for insertions and
// from oldLines for deletions and unchanged lines. Steps with any other type
// are skipped. The result is nil when the script is empty.
func computeLineEdits(oldLines, newLines []string) []Edit {
	var result []Edit

	for _, step := range myersDiff(oldLines, newLines) {
		var line string
		switch step.Type {
		case EditInsert:
			line = newLines[step.NewStart]
		case EditDelete, EditEqual:
			line = oldLines[step.OldStart]
		default:
			continue
		}
		result = append(result, Edit{Type: step.Type, Content: line})
	}

	return result
}
// DiffOp is one step of a diff script. The Old*/New* fields are index
// positions into the old and new line slices respectively.
type DiffOp struct {
Type EditType
OldStart int
OldEnd int
NewStart int
NewEnd int
}
// myersDiff builds a line-level diff script that turns oldLines into newLines.
//
// Despite the name, this is a simplified implementation: it derives the script
// from the longest common subsequence returned by longestCommonSubsequence
// rather than from a Myers edit graph.
//
// Every step covers exactly one line. Between two matched lines all pending
// deletions are emitted before all pending insertions. For a delete the new
// range is empty (NewStart == NewEnd); for an insert the old range is empty
// (OldStart == OldEnd). The result is nil when both inputs are empty.
func myersDiff(oldLines, newLines []string) []DiffOp {
	var ops []DiffOp
	oldPos, newPos := 0, 0 // cursors into oldLines and newLines

	// deleteUntil emits one delete step per old line before limit.
	deleteUntil := func(limit int) {
		for ; oldPos < limit; oldPos++ {
			ops = append(ops, DiffOp{
				Type:     EditDelete,
				OldStart: oldPos,
				OldEnd:   oldPos + 1,
				NewStart: newPos,
				NewEnd:   newPos,
			})
		}
	}

	// insertUntil emits one insert step per new line before limit.
	insertUntil := func(limit int) {
		for ; newPos < limit; newPos++ {
			ops = append(ops, DiffOp{
				Type:     EditInsert,
				OldStart: oldPos,
				OldEnd:   oldPos,
				NewStart: newPos,
				NewEnd:   newPos + 1,
			})
		}
	}

	for _, match := range longestCommonSubsequence(oldLines, newLines) {
		// Flush everything that differs before this common line.
		deleteUntil(match.OldIndex)
		insertUntil(match.NewIndex)

		// The common line itself.
		ops = append(ops, DiffOp{
			Type:     EditEqual,
			OldStart: oldPos,
			OldEnd:   oldPos + 1,
			NewStart: newPos,
			NewEnd:   newPos + 1,
		})
		oldPos++
		newPos++
	}

	// Whatever is left after the last common line.
	deleteUntil(len(oldLines))
	insertUntil(len(newLines))

	return ops
}
// LCSEntry is one element of a longest common subsequence: a pair of indexes
// (one into the old lines, one into the new lines) whose lines are equal.
type LCSEntry struct {
OldIndex int
NewIndex int
}
// longestCommonSubsequence returns a longest common subsequence of oldLines
// and newLines as index pairs, ordered by increasing index.
//
// It fills the classic (len(oldLines)+1) x (len(newLines)+1) dynamic
// programming table and then backtracks from the bottom-right corner. On a
// mismatch the walk moves up only when the cell above is strictly larger,
// otherwise left. The result is nil when the inputs share no line.
//
// Time and memory are O(len(oldLines) * len(newLines)).
func longestCommonSubsequence(oldLines, newLines []string) []LCSEntry {
	oldLen, newLen := len(oldLines), len(newLines)

	// dp[i][j] = LCS length of oldLines[:i] and newLines[:j].
	dp := make([][]int, oldLen+1)
	for i := range dp {
		dp[i] = make([]int, newLen+1)
	}
	for i := 1; i <= oldLen; i++ {
		for j := 1; j <= newLen; j++ {
			if oldLines[i-1] == newLines[j-1] {
				dp[i][j] = dp[i-1][j-1] + 1
			} else {
				dp[i][j] = max(dp[i-1][j], dp[i][j-1])
			}
		}
	}

	total := dp[oldLen][newLen]
	if total == 0 {
		return nil
	}

	// Backtracking discovers matches last-to-first. Collect them with a plain
	// append and reverse once at the end; prepending on every match would
	// copy the whole slice each time.
	lcs := make([]LCSEntry, 0, total)
	for i, j := oldLen, newLen; i > 0 && j > 0; {
		switch {
		case oldLines[i-1] == newLines[j-1]:
			lcs = append(lcs, LCSEntry{OldIndex: i - 1, NewIndex: j - 1})
			i--
			j--
		case dp[i-1][j] > dp[i][j-1]:
			i--
		default:
			j--
		}
	}
	for lo, hi := 0, len(lcs)-1; lo < hi; lo, hi = lo+1, hi-1 {
		lcs[lo], lcs[hi] = lcs[hi], lcs[lo]
	}
	return lcs
}
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
// estimateChangedTokens estimates how many tokens were changed by edits.
//
// The estimate is the number of words, as split by splitIntoWords, summed over
// the content of every insert and delete edit. Edits of any other type are
// ignored.
func estimateChangedTokens(edits []Edit) int {
	total := 0
	for i := range edits {
		if kind := edits[i].Type; kind != EditInsert && kind != EditDelete {
			continue
		}
		total += len(splitIntoWords(edits[i].Content))
	}
	return total
}
// splitIntoWords splits text into words.
//
// A word is a maximal run of consecutive characters accepted by isWordChar;
// every other character acts as a separator and is discarded. The result is
// nil when text contains no word character.
//
// Words are returned as substrings of text rather than being rebuilt one
// character at a time, so no intermediate strings are allocated.
func splitIntoWords(text string) []string {
	var words []string

	start := -1 // byte offset where the current word began, -1 if none
	for i, r := range text {
		if isWordChar(r) {
			if start < 0 {
				start = i
			}
			continue
		}
		if start >= 0 {
			words = append(words, text[start:i])
			start = -1
		}
	}

	// Flush a word that runs to the end of the text.
	if start >= 0 {
		words = append(words, text[start:])
	}
	return words
}
// isWordChar reports whether char is a word character: an ASCII letter
// (a-z, A-Z), an ASCII digit (0-9), or an underscore. All other runes,
// including non-ASCII letters and digits, are not word characters.
func isWordChar(char rune) bool {
	switch {
	case char == '_':
		return true
	case 'a' <= char && char <= 'z':
		return true
	case 'A' <= char && char <= 'Z':
		return true
	case '0' <= char && char <= '9':
		return true
	}
	return false
}