/**
 * Unicode utilities for Scala parser
 * Handles Unicode normalization and character validation
 */
|
||
|
||
/**
 * Normalizes a string to Unicode Normalization Form C (canonical
 * decomposition followed by canonical composition), so that canonically
 * equivalent sequences compare equal.
 *
 * @param text - The input text to normalize
 * @returns The NFC-normalized text
 */
export function normalizeUnicode(text: string): string {
  return text.normalize("NFC");
}
|
||
|
||
/**
 * Checks whether a character may start a Scala identifier.
 *
 * Accepted: ASCII letters, `_`, `$`, any character in the mathematical
 * operator blocks listed in the body, and any character in the Unicode
 * general categories Letter (L), Nonspacing Mark (Mn), Spacing Mark (Mc) or
 * Connector Punctuation (Pc). Decimal digits are not accepted.
 *
 * @param char - A string holding exactly one Unicode code point. A
 *   supplementary-plane character (two UTF-16 code units) counts as one.
 * @returns True if the character can start an identifier; false for the
 *   empty string, for strings of more than one code point, and for any
 *   character outside the accepted sets
 */
export function isIdentifierStart(char: string): boolean {
  const codePoint = char.codePointAt(0);
  if (codePoint === undefined) return false; // empty string

  // Require exactly one code point. Measuring in UTF-16 units alone would
  // wrongly reject supplementary-plane characters, which occupy two units.
  const expectedLength = codePoint > 0xffff ? 2 : 1;
  if (char.length !== expectedLength) return false;

  // Basic ASCII identifier characters
  if (
    (codePoint >= 0x41 && codePoint <= 0x5a) || // A-Z
    (codePoint >= 0x61 && codePoint <= 0x7a) || // a-z
    codePoint === 0x5f || // _
    codePoint === 0x24 // $
  ) {
    return true;
  }

  // Mathematical symbols range (extended)
  if (
    (codePoint >= 0x2200 && codePoint <= 0x22ff) || // Mathematical Operators
    (codePoint >= 0x27c0 && codePoint <= 0x27ef) || // Miscellaneous Mathematical Symbols-A
    (codePoint >= 0x2980 && codePoint <= 0x29ff) || // Miscellaneous Mathematical Symbols-B
    (codePoint >= 0x2a00 && codePoint <= 0x2aff) // Supplemental Mathematical Operators
  ) {
    return true;
  }

  // Everything else is decided by Unicode general category; digits (Nd) are
  // deliberately absent so they cannot start an identifier.
  return /[\p{L}\p{Mn}\p{Mc}\p{Pc}]/u.test(char);
}
|
||
|
||
/**
 * Checks whether a character may appear after the first character of a
 * Scala identifier.
 *
 * Accepted: ASCII letters and digits, `_`, `$`, any character in the
 * mathematical operator blocks listed in the body, and any character in the
 * Unicode general categories Letter (L), Nonspacing Mark (Mn), Spacing Mark
 * (Mc), Decimal Number (Nd), Connector Punctuation (Pc) or Format (Cf).
 *
 * @param char - A string holding exactly one Unicode code point. A
 *   supplementary-plane character (two UTF-16 code units) counts as one.
 * @returns True if the character can continue an identifier; false for the
 *   empty string, for strings of more than one code point, and for any
 *   character outside the accepted sets
 */
export function isIdentifierContinue(char: string): boolean {
  const codePoint = char.codePointAt(0);
  if (codePoint === undefined) return false; // empty string

  // Require exactly one code point. Measuring in UTF-16 units alone would
  // wrongly reject supplementary-plane characters, which occupy two units.
  const expectedLength = codePoint > 0xffff ? 2 : 1;
  if (char.length !== expectedLength) return false;

  // Basic ASCII identifier characters
  if (
    (codePoint >= 0x41 && codePoint <= 0x5a) || // A-Z
    (codePoint >= 0x61 && codePoint <= 0x7a) || // a-z
    (codePoint >= 0x30 && codePoint <= 0x39) || // 0-9
    codePoint === 0x5f || // _
    codePoint === 0x24 // $
  ) {
    return true;
  }

  // Mathematical symbols range (extended)
  if (
    (codePoint >= 0x2200 && codePoint <= 0x22ff) || // Mathematical Operators
    (codePoint >= 0x27c0 && codePoint <= 0x27ef) || // Miscellaneous Mathematical Symbols-A
    (codePoint >= 0x2980 && codePoint <= 0x29ff) || // Miscellaneous Mathematical Symbols-B
    (codePoint >= 0x2a00 && codePoint <= 0x2aff) // Supplemental Mathematical Operators
  ) {
    return true;
  }

  // Everything else is decided by Unicode general category, including
  // decimal digits (Nd) and format characters (Cf).
  return /[\p{L}\p{Mn}\p{Mc}\p{Nd}\p{Pc}\p{Cf}]/u.test(char);
}
|
||
|
||
/**
 * Validates that a string is a valid Scala identifier.
 *
 * The input is NFC-normalized, then its first character is checked with
 * `isIdentifierStart` and every following character with
 * `isIdentifierContinue`.
 *
 * @param identifier - The identifier string to validate
 * @returns True if the string is non-empty and every character passes its
 *   positional check; false otherwise
 */
export function isValidIdentifier(identifier: string): boolean {
  if (!identifier) return false;

  const normalized = normalizeUnicode(identifier);

  // `for...of` walks the string by code point, so a surrogate pair is handed
  // to the predicates as one character instead of two lone halves. It also
  // avoids index access, which is `string | undefined` under
  // `noUncheckedIndexedAccess`.
  let atStart = true;
  for (const character of normalized) {
    const accepted = atStart
      ? isIdentifierStart(character)
      : isIdentifierContinue(character);
    if (!accepted) return false;
    atStart = false;
  }

  return true;
}
|
||
|
||
/**
 * Converts Unicode escape sequences in a string to the characters they
 * denote.
 *
 * Recognizes a backslash, one or more `u`, and exactly four hexadecimal
 * digits (`\uXXXX`, `\uuXXXX`, ...), matching the Scala `UnicodeEscape`
 * grammar. Each escape yields one UTF-16 code unit, so a supplementary-plane
 * character written as two consecutive escapes (a surrogate pair) is
 * reassembled correctly. Text that does not match the pattern is left
 * untouched.
 *
 * @param text - The text containing Unicode escapes
 * @returns The text with Unicode escapes converted to actual characters
 */
export function processUnicodeEscapes(text: string): string {
  return text.replace(/\\u+([0-9A-Fa-f]{4})/g, (_match, hex: string) =>
    String.fromCharCode(parseInt(hex, 16)),
  );
}
|
||
|
||
/**
 * Escapes non-ASCII characters in a string as `\uXXXX` sequences.
 *
 * When `escapeNonAscii` is false the text is returned unchanged. Otherwise
 * every UTF-16 code unit in the range U+0080-U+FFFF is replaced by `\u`
 * followed by its four-digit uppercase hexadecimal value. Because the
 * replacement works per code unit, a supplementary-plane character is
 * written as two escapes, one for each surrogate.
 *
 * @param text - The text to escape
 * @param escapeNonAscii - Whether to escape all non-ASCII characters
 * @returns The escaped text
 */
export function escapeUnicode(text: string, escapeNonAscii = false): string {
  if (!escapeNonAscii) return text;

  return text.replace(/[\u0080-\uFFFF]/g, (unit) => {
    const hex = unit.charCodeAt(0).toString(16).padStart(4, "0").toUpperCase();
    return `\\u${hex}`;
  });
}
|
||
|
||
/**
 * Extended mathematical symbols commonly used in Scala functional programming.
 *
 * Declared `as const`, so each property has its literal string type and the
 * object is read-only at the type level.
 */
export const MATHEMATICAL_SYMBOLS = {
  // Greek letters commonly used in functional programming
  ALPHA: "α", // U+03B1
  BETA: "β", // U+03B2
  GAMMA: "γ", // U+03B3
  DELTA: "δ", // U+03B4
  LAMBDA: "λ", // U+03BB
  MU: "μ", // U+03BC
  PI: "π", // U+03C0
  SIGMA: "σ", // U+03C3
  TAU: "τ", // U+03C4
  PHI: "φ", // U+03C6

  // Mathematical operators
  FORALL: "∀", // U+2200
  EXISTS: "∃", // U+2203
  ELEMENT_OF: "∈", // U+2208
  NOT_ELEMENT_OF: "∉", // U+2209
  SUBSET: "⊂", // U+2282
  SUPERSET: "⊃", // U+2283
  UNION: "∪", // U+222A
  INTERSECTION: "∩", // U+2229

  // Arrows and other symbols
  RIGHTWARDS_ARROW: "→", // U+2192
  LEFTWARDS_ARROW: "←", // U+2190
  UP_ARROW: "↑", // U+2191
  DOWN_ARROW: "↓", // U+2193
} as const;
|