✨ Complete the document saving service
This commit is contained in:
306
internal/services/document_diff.go
Normal file
306
internal/services/document_diff.go
Normal file
@@ -0,0 +1,306 @@
|
||||
package services
|
||||
|
||||
// EditType identifies the kind of operation an edit represents.
type EditType int

// Edit operation kinds. Values are assigned by iota in declaration order, so
// EditInsert is 0 and is therefore the zero value of EditType.
const (
	// EditInsert marks inserted content.
	EditInsert EditType = iota
	// EditDelete marks deleted content.
	EditDelete
	// EditEqual marks content that is the same on both sides.
	EditEqual
)
|
||||
|
||||
// Edit 表示单个编辑操作
|
||||
type Edit struct {
|
||||
Type EditType // 操作类型
|
||||
Content string // 操作内容
|
||||
}
|
||||
|
||||
// DiffResult 包含差异比较的结果信息
|
||||
type DiffResult struct {
|
||||
Edits []Edit // 编辑操作列表
|
||||
InsertCount int // 插入的字符数
|
||||
DeleteCount int // 删除的字符数
|
||||
ChangedLines int // 变更的行数
|
||||
TotalChanges int // 总变更字符数(插入+删除)
|
||||
ChangedTokens int // 变更的token数(如单词、标识符等)
|
||||
}
|
||||
|
||||
// calculateChangesDetailed 使用Myers差分算法计算两个字符串之间的具体变更
|
||||
func calculateChangesDetailed(oldText, newText string) DiffResult {
|
||||
// 将文本分割成行
|
||||
oldLines := splitLines(oldText)
|
||||
newLines := splitLines(newText)
|
||||
|
||||
// 计算行级别的差异
|
||||
edits := computeLineEdits(oldLines, newLines)
|
||||
|
||||
// 计算变更统计
|
||||
result := DiffResult{
|
||||
Edits: edits,
|
||||
}
|
||||
|
||||
// 统计变更
|
||||
for _, edit := range edits {
|
||||
switch edit.Type {
|
||||
case EditInsert:
|
||||
result.InsertCount += len(edit.Content)
|
||||
result.ChangedLines++
|
||||
case EditDelete:
|
||||
result.DeleteCount += len(edit.Content)
|
||||
result.ChangedLines++
|
||||
}
|
||||
}
|
||||
|
||||
result.TotalChanges = result.InsertCount + result.DeleteCount
|
||||
result.ChangedTokens = estimateChangedTokens(edits)
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// splitLines splits text into lines at each '\n'. The newline itself is not
// part of any returned line; any other character (including '\r') is kept.
//
// A final line without a terminating newline is included. A trailing newline
// does not produce an extra empty line, while empty lines in the middle of the
// text are preserved. Empty input yields a nil slice.
//
// Lines are substrings of text, so bytes are returned unchanged even when the
// input is not valid UTF-8.
func splitLines(text string) []string {
	var lines []string

	// Scanning bytes is safe: '\n' (0x0A) never occurs inside a multi-byte
	// UTF-8 sequence.
	start := 0
	for i := 0; i < len(text); i++ {
		if text[i] == '\n' {
			lines = append(lines, text[start:i])
			start = i + 1
		}
	}

	// Keep the last line when the text does not end with a newline.
	if start < len(text) {
		lines = append(lines, text[start:])
	}

	return lines
}
|
||||
|
||||
// computeLineEdits 使用Myers差分算法计算行级别的差异
|
||||
func computeLineEdits(oldLines, newLines []string) []Edit {
|
||||
var edits []Edit
|
||||
|
||||
// 使用Myers差分算法计算行级别的差异
|
||||
script := myersDiff(oldLines, newLines)
|
||||
|
||||
// 将差异脚本转换为编辑操作
|
||||
for _, op := range script {
|
||||
switch op.Type {
|
||||
case EditEqual:
|
||||
edits = append(edits, Edit{
|
||||
Type: EditEqual,
|
||||
Content: oldLines[op.OldStart],
|
||||
})
|
||||
case EditDelete:
|
||||
edits = append(edits, Edit{
|
||||
Type: EditDelete,
|
||||
Content: oldLines[op.OldStart],
|
||||
})
|
||||
case EditInsert:
|
||||
edits = append(edits, Edit{
|
||||
Type: EditInsert,
|
||||
Content: newLines[op.NewStart],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return edits
|
||||
}
|
||||
|
||||
// DiffOp 表示差分操作
|
||||
type DiffOp struct {
|
||||
Type EditType
|
||||
OldStart int
|
||||
OldEnd int
|
||||
NewStart int
|
||||
NewEnd int
|
||||
}
|
||||
|
||||
// myersDiff 实现Myers差分算法
|
||||
func myersDiff(oldLines, newLines []string) []DiffOp {
|
||||
// 基本思路:Myers差分算法通过建立编辑图来寻找最短编辑路径
|
||||
// 简化版实现
|
||||
var script []DiffOp
|
||||
|
||||
oldLen := len(oldLines)
|
||||
newLen := len(newLines)
|
||||
|
||||
// 使用动态规划找出最长公共子序列(LCS)
|
||||
lcs := longestCommonSubsequence(oldLines, newLines)
|
||||
|
||||
// 根据LCS构建差分脚本
|
||||
oldIndex, newIndex := 0, 0
|
||||
for _, entry := range lcs {
|
||||
// 处理LCS之前的差异
|
||||
for oldIndex < entry.OldIndex {
|
||||
script = append(script, DiffOp{
|
||||
Type: EditDelete,
|
||||
OldStart: oldIndex,
|
||||
OldEnd: oldIndex + 1,
|
||||
NewStart: newIndex,
|
||||
NewEnd: newIndex,
|
||||
})
|
||||
oldIndex++
|
||||
}
|
||||
|
||||
for newIndex < entry.NewIndex {
|
||||
script = append(script, DiffOp{
|
||||
Type: EditInsert,
|
||||
OldStart: oldIndex,
|
||||
OldEnd: oldIndex,
|
||||
NewStart: newIndex,
|
||||
NewEnd: newIndex + 1,
|
||||
})
|
||||
newIndex++
|
||||
}
|
||||
|
||||
// 处理相等部分
|
||||
script = append(script, DiffOp{
|
||||
Type: EditEqual,
|
||||
OldStart: oldIndex,
|
||||
OldEnd: oldIndex + 1,
|
||||
NewStart: newIndex,
|
||||
NewEnd: newIndex + 1,
|
||||
})
|
||||
|
||||
oldIndex++
|
||||
newIndex++
|
||||
}
|
||||
|
||||
// 处理剩余差异
|
||||
for oldIndex < oldLen {
|
||||
script = append(script, DiffOp{
|
||||
Type: EditDelete,
|
||||
OldStart: oldIndex,
|
||||
OldEnd: oldIndex + 1,
|
||||
NewStart: newIndex,
|
||||
NewEnd: newIndex,
|
||||
})
|
||||
oldIndex++
|
||||
}
|
||||
|
||||
for newIndex < newLen {
|
||||
script = append(script, DiffOp{
|
||||
Type: EditInsert,
|
||||
OldStart: oldIndex,
|
||||
OldEnd: oldIndex,
|
||||
NewStart: newIndex,
|
||||
NewEnd: newIndex + 1,
|
||||
})
|
||||
newIndex++
|
||||
}
|
||||
|
||||
return script
|
||||
}
|
||||
|
||||
// LCSEntry is one element of a longest common subsequence: the positions of a
// matching line in the old and in the new slice.
type LCSEntry struct {
	OldIndex int
	NewIndex int
}

// longestCommonSubsequence returns a longest common subsequence of oldLines
// and newLines as pairs of matching indices, ordered by increasing index.
//
// It fills a (len(oldLines)+1) x (len(newLines)+1) dynamic-programming table
// and then backtracks from the bottom-right corner. While backtracking, equal
// lines are always taken as a match; otherwise the walk moves up only when the
// cell above is strictly larger and moves left on ties. The result is nil when
// the inputs share no line.
func longestCommonSubsequence(oldLines, newLines []string) []LCSEntry {
	oldLen, newLen := len(oldLines), len(newLines)

	// dp[i][j] is the LCS length of oldLines[:i] and newLines[:j].
	dp := make([][]int, oldLen+1)
	for i := range dp {
		dp[i] = make([]int, newLen+1)
	}

	for i := 1; i <= oldLen; i++ {
		for j := 1; j <= newLen; j++ {
			if oldLines[i-1] == newLines[j-1] {
				dp[i][j] = dp[i-1][j-1] + 1
			} else {
				dp[i][j] = max(dp[i-1][j], dp[i][j-1])
			}
		}
	}

	// The final length is known up front, so the result is allocated once and
	// filled from the back instead of being prepended to on every match.
	remaining := dp[oldLen][newLen]
	if remaining == 0 {
		return nil
	}
	lcs := make([]LCSEntry, remaining)

	i, j := oldLen, newLen
	for i > 0 && j > 0 {
		switch {
		case oldLines[i-1] == newLines[j-1]:
			remaining--
			lcs[remaining] = LCSEntry{OldIndex: i - 1, NewIndex: j - 1}
			i--
			j--
		case dp[i-1][j] > dp[i][j-1]:
			i--
		default:
			j--
		}
	}

	return lcs
}
|
||||
|
||||
// max returns the larger of the two integers a and b.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}
|
||||
|
||||
// estimateChangedTokens 估计变更的token数量
|
||||
// 这里使用简单的单词分割来估计
|
||||
func estimateChangedTokens(edits []Edit) int {
|
||||
tokenCount := 0
|
||||
|
||||
for _, edit := range edits {
|
||||
switch edit.Type {
|
||||
case EditInsert, EditDelete:
|
||||
// 简单地将内容按空白字符分割成单词
|
||||
words := splitIntoWords(edit.Content)
|
||||
tokenCount += len(words)
|
||||
}
|
||||
}
|
||||
|
||||
return tokenCount
|
||||
}
|
||||
|
||||
// splitIntoWords 将文本分割成单词
|
||||
func splitIntoWords(text string) []string {
|
||||
var words []string
|
||||
var currentWord string
|
||||
|
||||
// 简单的状态机:
|
||||
// - 如果是字母、数字或下划线,添加到当前单词
|
||||
// - 否则,结束当前单词并开始新单词
|
||||
for _, char := range text {
|
||||
if isWordChar(char) {
|
||||
currentWord += string(char)
|
||||
} else {
|
||||
if currentWord != "" {
|
||||
words = append(words, currentWord)
|
||||
currentWord = ""
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 添加最后一个单词(如果有)
|
||||
if currentWord != "" {
|
||||
words = append(words, currentWord)
|
||||
}
|
||||
|
||||
return words
|
||||
}
|
||||
|
||||
// isWordChar reports whether char is a word character: an ASCII letter
// (a-z, A-Z), an ASCII digit (0-9) or an underscore.
func isWordChar(char rune) bool {
	switch {
	case char == '_':
		return true
	case 'a' <= char && char <= 'z',
		'A' <= char && char <= 'Z',
		'0' <= char && char <= '9':
		return true
	}
	return false
}
|
Reference in New Issue
Block a user