Files
voidraft/internal/services/document_diff.go

307 lines
6.5 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package services
// EditType identifies the kind of a single edit operation in a diff.
type EditType int
const (
// EditInsert marks content that is taken from the new text only (an insertion).
EditInsert EditType = iota
// EditDelete marks content that is taken from the old text only (a deletion).
EditDelete
// EditEqual marks content that is the same in the old and the new text.
EditEqual
)
// Edit is a single edit operation of a diff: an operation kind plus the
// content it applies to.
type Edit struct {
Type EditType // kind of operation
Content string // content the operation applies to
}
// DiffResult holds the edits produced by a comparison together with summary
// statistics about them.
type DiffResult struct {
Edits []Edit // list of edit operations
InsertCount int // total length of the inserted content
DeleteCount int // total length of the deleted content
ChangedLines int // number of changed lines
TotalChanges int // total change size (InsertCount + DeleteCount)
ChangedTokens int // number of changed tokens, e.g. words or identifiers
}
// calculateChangesDetailed computes a line-level diff between oldText and
// newText and summarises it in a DiffResult.
//
// Both texts are split into lines with splitLines and compared with
// computeLineEdits; the resulting edits are returned in DiffResult.Edits.
// Every inserted or deleted line adds one to ChangedLines and its length to
// InsertCount or DeleteCount respectively. TotalChanges is the sum of the two
// counts and ChangedTokens is taken from estimateChangedTokens.
//
// Lengths are measured in Unicode code points, not bytes, so multi-byte text
// (for example CJK) is counted once per character. Line terminators are not
// counted because the edits carry line contents only.
func calculateChangesDetailed(oldText, newText string) DiffResult {
	edits := computeLineEdits(splitLines(oldText), splitLines(newText))
	result := DiffResult{Edits: edits}

	for _, edit := range edits {
		switch edit.Type {
		case EditInsert:
			// len(string) would count bytes; convert to runes to count characters.
			result.InsertCount += len([]rune(edit.Content))
			result.ChangedLines++
		case EditDelete:
			result.DeleteCount += len([]rune(edit.Content))
			result.ChangedLines++
		}
	}

	result.TotalChanges = result.InsertCount + result.DeleteCount
	result.ChangedTokens = estimateChangedTokens(edits)
	return result
}
// splitLines splits text into lines on '\n'.
//
// The separators are not part of the returned lines. The segment after the
// last '\n' is only emitted when it is non-empty, so "a\n" and "a" both yield
// ["a"], and an empty text yields a nil slice. Carriage returns are not treated
// specially and stay attached to their line.
//
// Lines are sliced directly out of text rather than rebuilt one rune at a
// time. This keeps the work linear in len(text) and leaves the bytes of every
// line untouched, even when they are not valid UTF-8. The returned strings
// share memory with text.
func splitLines(text string) []string {
	var lines []string
	start := 0 // byte offset of the first byte of the current line

	// Scanning bytes is safe: 0x0A never occurs inside a multi-byte UTF-8
	// sequence.
	for i := 0; i < len(text); i++ {
		if text[i] == '\n' {
			lines = append(lines, text[start:i])
			start = i + 1
		}
	}

	// Keep the trailing segment only when it has content.
	if start < len(text) {
		lines = append(lines, text[start:])
	}
	return lines
}
// computeLineEdits diffs two line slices and returns one Edit per line.
//
// The script returned by myersDiff is walked in order and every operation is
// turned into an Edit carrying the line it refers to: the old line for equal
// and deleted entries, the new line for inserted ones. Operations of any other
// type are skipped. The result is nil when the script is empty.
func computeLineEdits(oldLines, newLines []string) []Edit {
	var result []Edit
	for _, step := range myersDiff(oldLines, newLines) {
		var line string
		switch step.Type {
		case EditEqual, EditDelete:
			line = oldLines[step.OldStart]
		case EditInsert:
			line = newLines[step.NewStart]
		default:
			continue
		}
		result = append(result, Edit{Type: step.Type, Content: line})
	}
	return result
}
// DiffOp is one step of a diff script. It describes the step as a pair of
// index ranges into the old and the new line slices; an operation that does
// not consume a line on one side has Start == End on that side.
type DiffOp struct {
Type EditType // kind of operation
OldStart int // index of the first old line covered
OldEnd int // index one past the last old line covered
NewStart int // index of the first new line covered
NewEnd int // index one past the last new line covered
}
// myersDiff builds a line-level diff script that turns oldLines into newLines.
//
// Despite its name, the script is derived from the longest common subsequence
// returned by longestCommonSubsequence: each matched pair becomes an EditEqual
// step, and the lines skipped before it become EditDelete steps (old side)
// followed by EditInsert steps (new side). Whatever remains after the last
// match is emitted the same way. Every step covers exactly one line.
func myersDiff(oldLines, newLines []string) []DiffOp {
	var ops []DiffOp
	oldPos, newPos := 0, 0

	// flush emits deletions until oldPos reaches oldStop, then insertions
	// until newPos reaches newStop.
	flush := func(oldStop, newStop int) {
		for ; oldPos < oldStop; oldPos++ {
			ops = append(ops, DiffOp{
				Type:     EditDelete,
				OldStart: oldPos,
				OldEnd:   oldPos + 1,
				NewStart: newPos,
				NewEnd:   newPos,
			})
		}
		for ; newPos < newStop; newPos++ {
			ops = append(ops, DiffOp{
				Type:     EditInsert,
				OldStart: oldPos,
				OldEnd:   oldPos,
				NewStart: newPos,
				NewEnd:   newPos + 1,
			})
		}
	}

	for _, match := range longestCommonSubsequence(oldLines, newLines) {
		// Everything before the matched pair differs.
		flush(match.OldIndex, match.NewIndex)

		ops = append(ops, DiffOp{
			Type:     EditEqual,
			OldStart: oldPos,
			OldEnd:   oldPos + 1,
			NewStart: newPos,
			NewEnd:   newPos + 1,
		})
		oldPos++
		newPos++
	}

	// Tail after the last common line.
	flush(len(oldLines), len(newLines))
	return ops
}
// LCSEntry is one matched pair of a longest common subsequence: the positions
// of the same line in the old and in the new slice.
type LCSEntry struct {
OldIndex int // index of the matched line in the old slice
NewIndex int // index of the matched line in the new slice
}
// longestCommonSubsequence returns a longest common subsequence of oldLines
// and newLines as index pairs, ordered by increasing index.
//
// It fills the classic (len(oldLines)+1) x (len(newLines)+1) dynamic
// programming table, where dp[i][j] is the LCS length of oldLines[:i] and
// newLines[:j], and then walks the table backwards from the bottom-right
// corner. The result is nil when the inputs have no line in common.
//
// Time and memory are proportional to len(oldLines) * len(newLines).
func longestCommonSubsequence(oldLines, newLines []string) []LCSEntry {
	oldLen, newLen := len(oldLines), len(newLines)

	dp := make([][]int, oldLen+1)
	for i := range dp {
		dp[i] = make([]int, newLen+1)
	}

	for i := 1; i <= oldLen; i++ {
		for j := 1; j <= newLen; j++ {
			if oldLines[i-1] == newLines[j-1] {
				dp[i][j] = dp[i-1][j-1] + 1
			} else {
				dp[i][j] = max(dp[i-1][j], dp[i][j-1])
			}
		}
	}

	// The table already tells us how many pairs the walk below will find, so
	// the result can be allocated once and filled from the back instead of
	// prepending (and copying everything) on every match.
	remaining := dp[oldLen][newLen]
	if remaining == 0 {
		return nil
	}
	lcs := make([]LCSEntry, remaining)

	i, j := oldLen, newLen
	for i > 0 && j > 0 {
		switch {
		case oldLines[i-1] == newLines[j-1]:
			remaining--
			lcs[remaining] = LCSEntry{OldIndex: i - 1, NewIndex: j - 1}
			i--
			j--
		case dp[i-1][j] > dp[i][j-1]:
			i--
		default:
			j--
		}
	}
	return lcs
}
// max returns the larger of the two integers a and b.
//
// NOTE: on Go 1.21 and later this package-level function shadows the built-in
// max inside the package.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
// estimateChangedTokens estimates how many tokens were touched by edits.
//
// Only inserted and deleted edits contribute; for each of them the content is
// split with splitIntoWords and the number of words is added to the total.
func estimateChangedTokens(edits []Edit) int {
	total := 0
	for _, e := range edits {
		if e.Type != EditInsert && e.Type != EditDelete {
			continue
		}
		total += len(splitIntoWords(e.Content))
	}
	return total
}
// splitIntoWords splits text into words.
//
// A word is a maximal run of characters accepted by isWordChar; every other
// character acts as a separator and is dropped. The result is nil when text
// contains no word character.
//
// Words are sliced out of text instead of being concatenated rune by rune, so
// the scan does a single pass without per-character allocations. The returned
// strings share memory with text.
func splitIntoWords(text string) []string {
	var words []string
	start := -1 // byte offset where the current word began; -1 when outside a word

	for i, r := range text {
		if isWordChar(r) {
			if start < 0 {
				start = i
			}
			continue
		}
		// A separator closes the word in progress, if any.
		if start >= 0 {
			words = append(words, text[start:i])
			start = -1
		}
	}

	// Flush a word that runs to the end of the text.
	if start >= 0 {
		words = append(words, text[start:])
	}
	return words
}
// isWordChar reports whether char is a word character: an ASCII letter, an
// ASCII digit or an underscore.
func isWordChar(char rune) bool {
	switch {
	case 'a' <= char && char <= 'z':
		return true
	case 'A' <= char && char <= 'Z':
		return true
	case '0' <= char && char <= '9':
		return true
	}
	return char == '_'
}