Files
voidraft/frontend/src/views/editor/extensions/codeblock/language-detection/heuristics.ts
2025-06-18 21:19:19 +08:00

269 lines
6.8 KiB
TypeScript

/**
* 基于启发式规则的语言检测
* 用于快速识别常见的编程语言模式
*/
import { SupportedLanguage } from '../types';
/**
 * Result of a language detection pass.
 */
export interface LanguageDetectionResult {
/** Identifier of the detected language. */
language: SupportedLanguage;
/**
 * How sure the detector is about `language`. The heuristic detector in
 * this file never reports more than 1.0 (scores are capped with Math.min).
 */
confidence: number;
}
/**
 * Detection rule for a single language.
 */
interface LanguagePattern {
/** Regular expressions whose matches against the content are counted by calculateScore. */
patterns: RegExp[];
/** Multiplier that calculateScore applies to the total match count. */
weight: number;
}
/**
 * Heuristic detection rules, keyed by language identifier.
 *
 * Each entry lists regular expressions that hint at the language plus a
 * weight. calculateScore counts the matches of every regex against the
 * content and multiplies the total by the weight, so a higher weight lets a
 * language win with fewer matches.
 *
 * Entry order matters: detectLanguageHeuristic walks the entries in
 * declaration order before sorting by score, so with a stable sort the
 * earlier entry wins a tie.
 *
 * NOTE(review): the keys are cast to SupportedLanguage by the functions in
 * this file without a check — confirm every key is a member of that type.
 */
const LANGUAGE_PATTERNS: Record<string, LanguagePattern> = {
javascript: {
patterns: [
/\b(function|const|let|var|class|extends|import|export|async|await)\b/g,
/\b(console\.log|document\.|window\.)\b/g,
/=>\s*[{(]/g,
/\b(require|module\.exports)\b/g,
],
weight: 1.0,
},
typescript: {
patterns: [
/\b(interface|type|enum|namespace|implements|declare)\b/g,
/:\s*(string|number|boolean|object|any)\b/g,
/<[A-Z][a-zA-Z0-9<>,\s]*>/g,
/\b(public|private|protected|readonly)\b/g,
],
weight: 1.2,
},
python: {
patterns: [
/\b(def|class|import|from|if __name__|print|len|range)\b/g,
/^\s*#.*$/gm,
/\b(True|False|None)\b/g,
/:\s*$/gm,
],
weight: 1.0,
},
java: {
patterns: [
/\b(public|private|protected|static|final|class|interface)\b/g,
/\b(System\.out\.println|String|int|void)\b/g,
/import\s+[a-zA-Z0-9_.]+;/g,
/\b(extends|implements)\b/g,
],
weight: 1.0,
},
html: {
patterns: [
/<\/?[a-zA-Z][^>]*>/g,
/<!DOCTYPE\s+html>/gi,
/<(div|span|p|h[1-6]|body|head|html)\b/g,
/\s(class|id|src|href)=/g,
],
weight: 1.5,
},
css: {
patterns: [
/[.#][a-zA-Z][\w-]*\s*{/g,
/\b(color|background|margin|padding|font-size):\s*[^;]+;/g,
/@(media|keyframes|import)\b/g,
/\{[^}]*\}/g,
],
weight: 1.3,
},
json: {
patterns: [
// No `g` flag and no `m` flag: checks the overall shape of the whole document once.
/^\s*[{\[][\s\S]*[}\]]\s*$/,
/"[^"]*":\s*(".*"|[\d.]+|true|false|null)/g,
/,\s*$/gm,
],
weight: 2.0,
},
sql: {
patterns: [
/\b(SELECT|FROM|WHERE|INSERT|UPDATE|DELETE|CREATE|ALTER|DROP)\b/gi,
/\b(JOIN|LEFT|RIGHT|INNER|OUTER|ON|GROUP BY|ORDER BY)\b/gi,
/;\s*$/gm,
/\b(TABLE|DATABASE|INDEX)\b/gi,
],
weight: 1.4,
},
shell: {
patterns: [
/^#!/g,
/\b(echo|cd|ls|grep|awk|sed|cat|chmod)\b/g,
/\$\{?\w+\}?/g,
/\|\s*\w+/g,
],
weight: 1.2,
},
markdown: {
patterns: [
/^#+\s+/gm,
/\*\*.*?\*\*/g,
/\[.*?\]\(.*?\)/g,
/^```/gm,
],
weight: 1.1,
},
php: {
patterns: [
/<\?php/g,
/\$\w+/g,
/\b(function|class|extends|implements)\b/g,
/echo\s+/g,
],
weight: 1.3,
},
cpp: {
patterns: [
/#include\s*<.*>/g,
/\b(int|char|float|double|void|class|struct)\b/g,
/std::/g,
/cout\s*<<|cin\s*>>/g,
],
weight: 1.1,
},
rust: {
patterns: [
/\bfn\s+\w+/g,
/\b(let|mut|struct|enum|impl|trait)\b/g,
/println!\(/g,
/::\w+/g,
],
weight: 1.2,
},
go: {
patterns: [
/\bfunc\s+\w+/g,
/\b(var|const|type|package|import)\b/g,
/fmt\.\w+/g,
/:=\s*/g,
],
weight: 1.1,
},
ruby: {
patterns: [
/\b(def|class|module|end)\b/g,
/\b(puts|print|require)\b/g,
/@\w+/g,
/\|\w+\|/g,
],
weight: 1.0,
},
yaml: {
patterns: [
/^\s*\w+:\s*.*$/gm, // key: value pair
/^\s*-\s+\w+/gm, // list item
/^---\s*$/gm, // document separator
/^\s*\w+:\s*\|/gm, // multi-line string (`|`)
/^\s*\w+:\s*>/gm, // folded string (`>`)
/^\s*#.*$/gm, // comment
/:\s*\[.*\]/g, // inline array
/:\s*\{.*\}/g, // inline object
],
weight: 1.5,
},
xml: {
patterns: [
/<\?xml/g,
/<\/\w+>/g,
/<\w+[^>]*\/>/g,
/\s\w+="[^"]*"/g,
],
weight: 1.3,
},
};
/**
 * Strict JSON detection.
 *
 * Only content that, once trimmed, is wrapped in `{...}` or `[...]` AND is
 * accepted by JSON.parse counts as JSON. Bare primitives such as `123` or
 * `"text"` are deliberately not reported.
 *
 * @param content Raw text to inspect.
 * @returns A 'json' result with confidence 1.0, or null when the content is
 *          not a parseable JSON object/array.
 */
function detectJSON(content: string): LanguageDetectionResult | null {
  const candidate = content.trim();

  const looksLikeObject = candidate.startsWith('{') && candidate.endsWith('}');
  const looksLikeArray = candidate.startsWith('[') && candidate.endsWith(']');
  if (!looksLikeObject && !looksLikeArray) {
    return null;
  }

  try {
    JSON.parse(candidate);
  } catch {
    // Not valid JSON; let the caller fall through to the other heuristics.
    return null;
  }

  return { language: 'json', confidence: 1.0 };
}
/**
 * Score how well `content` matches one language's patterns.
 *
 * The score is the total number of regex hits, multiplied by the pattern's
 * weight and normalised to "hits per 100 characters" so long documents do
 * not win merely by being long.
 *
 * @param content Text being classified.
 * @param pattern Regexes and weight for a single language.
 * @returns A non-negative score; 0 when nothing matched.
 */
function calculateScore(content: string, pattern: LanguagePattern): number {
  // Guard against division by zero for empty content.
  const contentLength = Math.max(content.length, 1);

  let hits = 0;
  for (const regex of pattern.patterns) {
    const matches = content.match(regex);
    if (matches === null) {
      continue;
    }
    // With the `g` flag, match() returns every full match. Without it, the
    // array is [fullMatch, ...captureGroups], so its length would count
    // capture groups rather than hits; treat that case as a single hit.
    hits += regex.global ? matches.length : 1;
  }

  // Normalise by content length (per 100 characters) and apply the weight.
  return (hits * pattern.weight) / (contentLength / 100);
}
/**
 * Guess the language of `content` using the heuristic rules in this file.
 *
 * Resolution order:
 *  1. Blank content is reported as 'text'.
 *  2. Content accepted by detectJSON is reported as 'json'.
 *  3. Otherwise every LANGUAGE_PATTERNS entry is scored and the highest
 *     positive score wins; on a tie the entry declared first is kept.
 *  4. If nothing scores above zero, the result is 'text'.
 *
 * @param content Text to classify.
 * @returns The detected language and a confidence capped at 1.0.
 */
export function detectLanguageHeuristic(content: string): LanguageDetectionResult {
  if (content.trim().length === 0) {
    return { language: 'text', confidence: 1.0 };
  }

  // Strict JSON check comes first.
  const strictJson = detectJSON(content);
  if (strictJson !== null) {
    return strictJson;
  }

  // Track the best-scoring language in a single pass. The strict `>` keeps
  // the first-declared language when scores are equal.
  let winner: string | null = null;
  let topScore = 0;
  for (const [language, pattern] of Object.entries(LANGUAGE_PATTERNS)) {
    const score = calculateScore(content, pattern);
    if (score > topScore) {
      winner = language;
      topScore = score;
    }
  }

  if (winner === null) {
    return { language: 'text', confidence: 1.0 };
  }

  return {
    language: winner as SupportedLanguage,
    confidence: Math.min(topScore, 1.0),
  };
}
/**
 * List the languages the heuristic detector has rules for.
 *
 * @returns The keys of LANGUAGE_PATTERNS. 'text' is not included because it
 *          is only a fallback result, not a rule entry.
 */
export function getSupportedDetectionLanguages(): SupportedLanguage[] {
  const languageIds = Object.keys(LANGUAGE_PATTERNS);
  return languageIds as SupportedLanguage[];
}