307 lines
6.5 KiB
Go
307 lines
6.5 KiB
Go
package services
|
||
|
||
// Edit 表示编辑操作类型
|
||
type EditType int
|
||
|
||
const (
|
||
// EditInsert 插入操作
|
||
EditInsert EditType = iota
|
||
// EditDelete 删除操作
|
||
EditDelete
|
||
// EditEqual 相等部分
|
||
EditEqual
|
||
)
|
||
|
||
// Edit 表示单个编辑操作
|
||
type Edit struct {
|
||
Type EditType // 操作类型
|
||
Content string // 操作内容
|
||
}
|
||
|
||
// DiffResult 包含差异比较的结果信息
|
||
type DiffResult struct {
|
||
Edits []Edit // 编辑操作列表
|
||
InsertCount int // 插入的字符数
|
||
DeleteCount int // 删除的字符数
|
||
ChangedLines int // 变更的行数
|
||
TotalChanges int // 总变更字符数(插入+删除)
|
||
ChangedTokens int // 变更的token数(如单词、标识符等)
|
||
}
|
||
|
||
// calculateChangesDetailed 使用Myers差分算法计算两个字符串之间的具体变更
|
||
func calculateChangesDetailed(oldText, newText string) DiffResult {
|
||
// 将文本分割成行
|
||
oldLines := splitLines(oldText)
|
||
newLines := splitLines(newText)
|
||
|
||
// 计算行级别的差异
|
||
edits := computeLineEdits(oldLines, newLines)
|
||
|
||
// 计算变更统计
|
||
result := DiffResult{
|
||
Edits: edits,
|
||
}
|
||
|
||
// 统计变更
|
||
for _, edit := range edits {
|
||
switch edit.Type {
|
||
case EditInsert:
|
||
result.InsertCount += len(edit.Content)
|
||
result.ChangedLines++
|
||
case EditDelete:
|
||
result.DeleteCount += len(edit.Content)
|
||
result.ChangedLines++
|
||
}
|
||
}
|
||
|
||
result.TotalChanges = result.InsertCount + result.DeleteCount
|
||
result.ChangedTokens = estimateChangedTokens(edits)
|
||
|
||
return result
|
||
}
|
||
|
||
// splitLines 将文本分割成行
|
||
func splitLines(text string) []string {
|
||
var lines []string
|
||
var currentLine string
|
||
|
||
for _, char := range text {
|
||
if char == '\n' {
|
||
lines = append(lines, currentLine)
|
||
currentLine = ""
|
||
} else {
|
||
currentLine += string(char)
|
||
}
|
||
}
|
||
|
||
// 添加最后一行(如果不是以换行符结尾)
|
||
if currentLine != "" {
|
||
lines = append(lines, currentLine)
|
||
}
|
||
|
||
return lines
|
||
}
|
||
|
||
// computeLineEdits 使用Myers差分算法计算行级别的差异
|
||
func computeLineEdits(oldLines, newLines []string) []Edit {
|
||
var edits []Edit
|
||
|
||
// 使用Myers差分算法计算行级别的差异
|
||
script := myersDiff(oldLines, newLines)
|
||
|
||
// 将差异脚本转换为编辑操作
|
||
for _, op := range script {
|
||
switch op.Type {
|
||
case EditEqual:
|
||
edits = append(edits, Edit{
|
||
Type: EditEqual,
|
||
Content: oldLines[op.OldStart],
|
||
})
|
||
case EditDelete:
|
||
edits = append(edits, Edit{
|
||
Type: EditDelete,
|
||
Content: oldLines[op.OldStart],
|
||
})
|
||
case EditInsert:
|
||
edits = append(edits, Edit{
|
||
Type: EditInsert,
|
||
Content: newLines[op.NewStart],
|
||
})
|
||
}
|
||
}
|
||
|
||
return edits
|
||
}
|
||
|
||
// DiffOp 表示差分操作
|
||
type DiffOp struct {
|
||
Type EditType
|
||
OldStart int
|
||
OldEnd int
|
||
NewStart int
|
||
NewEnd int
|
||
}
|
||
|
||
// myersDiff 实现Myers差分算法
|
||
func myersDiff(oldLines, newLines []string) []DiffOp {
|
||
// 基本思路:Myers差分算法通过建立编辑图来寻找最短编辑路径
|
||
// 简化版实现
|
||
var script []DiffOp
|
||
|
||
oldLen := len(oldLines)
|
||
newLen := len(newLines)
|
||
|
||
// 使用动态规划找出最长公共子序列(LCS)
|
||
lcs := longestCommonSubsequence(oldLines, newLines)
|
||
|
||
// 根据LCS构建差分脚本
|
||
oldIndex, newIndex := 0, 0
|
||
for _, entry := range lcs {
|
||
// 处理LCS之前的差异
|
||
for oldIndex < entry.OldIndex {
|
||
script = append(script, DiffOp{
|
||
Type: EditDelete,
|
||
OldStart: oldIndex,
|
||
OldEnd: oldIndex + 1,
|
||
NewStart: newIndex,
|
||
NewEnd: newIndex,
|
||
})
|
||
oldIndex++
|
||
}
|
||
|
||
for newIndex < entry.NewIndex {
|
||
script = append(script, DiffOp{
|
||
Type: EditInsert,
|
||
OldStart: oldIndex,
|
||
OldEnd: oldIndex,
|
||
NewStart: newIndex,
|
||
NewEnd: newIndex + 1,
|
||
})
|
||
newIndex++
|
||
}
|
||
|
||
// 处理相等部分
|
||
script = append(script, DiffOp{
|
||
Type: EditEqual,
|
||
OldStart: oldIndex,
|
||
OldEnd: oldIndex + 1,
|
||
NewStart: newIndex,
|
||
NewEnd: newIndex + 1,
|
||
})
|
||
|
||
oldIndex++
|
||
newIndex++
|
||
}
|
||
|
||
// 处理剩余差异
|
||
for oldIndex < oldLen {
|
||
script = append(script, DiffOp{
|
||
Type: EditDelete,
|
||
OldStart: oldIndex,
|
||
OldEnd: oldIndex + 1,
|
||
NewStart: newIndex,
|
||
NewEnd: newIndex,
|
||
})
|
||
oldIndex++
|
||
}
|
||
|
||
for newIndex < newLen {
|
||
script = append(script, DiffOp{
|
||
Type: EditInsert,
|
||
OldStart: oldIndex,
|
||
OldEnd: oldIndex,
|
||
NewStart: newIndex,
|
||
NewEnd: newIndex + 1,
|
||
})
|
||
newIndex++
|
||
}
|
||
|
||
return script
|
||
}
|
||
|
||
// LCSEntry 表示最长公共子序列中的一个条目
|
||
type LCSEntry struct {
|
||
OldIndex int
|
||
NewIndex int
|
||
}
|
||
|
||
// longestCommonSubsequence 寻找两个字符串数组的最长公共子序列
|
||
func longestCommonSubsequence(oldLines, newLines []string) []LCSEntry {
|
||
oldLen := len(oldLines)
|
||
newLen := len(newLines)
|
||
|
||
// 创建动态规划表
|
||
dp := make([][]int, oldLen+1)
|
||
for i := range dp {
|
||
dp[i] = make([]int, newLen+1)
|
||
}
|
||
|
||
// 填充DP表
|
||
for i := 1; i <= oldLen; i++ {
|
||
for j := 1; j <= newLen; j++ {
|
||
if oldLines[i-1] == newLines[j-1] {
|
||
dp[i][j] = dp[i-1][j-1] + 1
|
||
} else {
|
||
dp[i][j] = max(dp[i-1][j], dp[i][j-1])
|
||
}
|
||
}
|
||
}
|
||
|
||
// 回溯找出LCS
|
||
var lcs []LCSEntry
|
||
i, j := oldLen, newLen
|
||
for i > 0 && j > 0 {
|
||
if oldLines[i-1] == newLines[j-1] {
|
||
lcs = append([]LCSEntry{{OldIndex: i - 1, NewIndex: j - 1}}, lcs...)
|
||
i--
|
||
j--
|
||
} else if dp[i-1][j] > dp[i][j-1] {
|
||
i--
|
||
} else {
|
||
j--
|
||
}
|
||
}
|
||
|
||
return lcs
|
||
}
|
||
|
||
// max 返回两个整数中的较大值
|
||
func max(a, b int) int {
|
||
if a > b {
|
||
return a
|
||
}
|
||
return b
|
||
}
|
||
|
||
// estimateChangedTokens 估计变更的token数量
|
||
// 这里使用简单的单词分割来估计
|
||
func estimateChangedTokens(edits []Edit) int {
|
||
tokenCount := 0
|
||
|
||
for _, edit := range edits {
|
||
switch edit.Type {
|
||
case EditInsert, EditDelete:
|
||
// 简单地将内容按空白字符分割成单词
|
||
words := splitIntoWords(edit.Content)
|
||
tokenCount += len(words)
|
||
}
|
||
}
|
||
|
||
return tokenCount
|
||
}
|
||
|
||
// splitIntoWords 将文本分割成单词
|
||
func splitIntoWords(text string) []string {
|
||
var words []string
|
||
var currentWord string
|
||
|
||
// 简单的状态机:
|
||
// - 如果是字母、数字或下划线,添加到当前单词
|
||
// - 否则,结束当前单词并开始新单词
|
||
for _, char := range text {
|
||
if isWordChar(char) {
|
||
currentWord += string(char)
|
||
} else {
|
||
if currentWord != "" {
|
||
words = append(words, currentWord)
|
||
currentWord = ""
|
||
}
|
||
}
|
||
}
|
||
|
||
// 添加最后一个单词(如果有)
|
||
if currentWord != "" {
|
||
words = append(words, currentWord)
|
||
}
|
||
|
||
return words
|
||
}
|
||
|
||
// isWordChar 判断字符是否是单词字符(字母、数字或下划线)
|
||
func isWordChar(char rune) bool {
|
||
return (char >= 'a' && char <= 'z') ||
|
||
(char >= 'A' && char <= 'Z') ||
|
||
(char >= '0' && char <= '9') ||
|
||
char == '_'
|
||
}
|