Files
voidraft/frontend/src/common/prettier/plugins/scala/scala-parser/unicode-utils.ts

183 lines
5.4 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Unicode utilities for Scala parser
* Handles Unicode normalization and character validation
*/
/**
 * Returns the NFC form of the given text (canonical decomposition followed
 * by canonical composition), so that canonically equivalent sequences end up
 * with the same representation.
 *
 * @param text - The text to normalize
 * @returns The NFC-normalized text
 */
export function normalizeUnicode(text: string): string {
  const form = "NFC";
  const normalized = text.normalize(form);
  return normalized;
}
/**
 * Unicode categories accepted at the start of an identifier: letters (L),
 * non-spacing and spacing marks (Mn, Mc) and connector punctuation (Pc).
 * Decimal digits (Nd) are deliberately absent. Anchored so that exactly one
 * code point is matched; hoisted so the regex is compiled once.
 */
const IDENTIFIER_START_PATTERN = /^[\p{L}\p{Mn}\p{Mc}\p{Pc}]$/u;

/**
 * Checks whether a character may start an identifier.
 *
 * Accepted characters:
 * - ASCII letters, `_` and `$`
 * - the Mathematical Operators, Miscellaneous Mathematical Symbols-A/-B and
 *   Supplemental Mathematical Operators blocks
 * - any code point in Unicode categories L, Mn, Mc or Pc
 *
 * The input must be exactly one Unicode code point. Characters outside the
 * Basic Multilingual Plane are stored as a surrogate pair (string length 2)
 * and are accepted as a single character.
 *
 * @param char - A string holding a single code point
 * @returns True if the character can start an identifier
 */
export function isIdentifierStart(char: string): boolean {
  const codePoint = char.codePointAt(0);
  if (codePoint === undefined) return false; // empty string

  // One code point occupies 1 UTF-16 unit in the BMP and 2 units above it;
  // any other length means the input holds more than a single character.
  const expectedLength = codePoint > 0xffff ? 2 : 1;
  if (char.length !== expectedLength) return false;

  // Basic ASCII identifier characters
  if (
    (codePoint >= 0x41 && codePoint <= 0x5a) || // A-Z
    (codePoint >= 0x61 && codePoint <= 0x7a) || // a-z
    codePoint === 0x5f || // _
    codePoint === 0x24 // $
  ) {
    return true;
  }

  // Mathematical symbol blocks (extended)
  if (
    (codePoint >= 0x2200 && codePoint <= 0x22ff) || // Mathematical Operators
    (codePoint >= 0x27c0 && codePoint <= 0x27ef) || // Miscellaneous Mathematical Symbols-A
    (codePoint >= 0x2980 && codePoint <= 0x29ff) || // Miscellaneous Mathematical Symbols-B
    (codePoint >= 0x2a00 && codePoint <= 0x2aff) // Supplemental Mathematical Operators
  ) {
    return true;
  }

  // Everything else is decided by Unicode general category.
  return IDENTIFIER_START_PATTERN.test(char);
}
/**
 * Unicode categories accepted after the first character of an identifier:
 * letters (L), marks (Mn, Mc), decimal digits (Nd), connector punctuation
 * (Pc) and format characters (Cf). Anchored so that exactly one code point
 * is matched; hoisted so the regex is compiled once.
 */
const IDENTIFIER_CONTINUE_PATTERN = /^[\p{L}\p{Mn}\p{Mc}\p{Nd}\p{Pc}\p{Cf}]$/u;

/**
 * Checks whether a character may appear after the first character of an
 * identifier.
 *
 * Accepted characters:
 * - ASCII letters, ASCII digits, `_` and `$`
 * - the Mathematical Operators, Miscellaneous Mathematical Symbols-A/-B and
 *   Supplemental Mathematical Operators blocks
 * - any code point in Unicode categories L, Mn, Mc, Nd, Pc or Cf
 *
 * The input must be exactly one Unicode code point. Characters outside the
 * Basic Multilingual Plane are stored as a surrogate pair (string length 2)
 * and are accepted as a single character.
 *
 * @param char - A string holding a single code point
 * @returns True if the character can continue an identifier
 */
export function isIdentifierContinue(char: string): boolean {
  const codePoint = char.codePointAt(0);
  if (codePoint === undefined) return false; // empty string

  // One code point occupies 1 UTF-16 unit in the BMP and 2 units above it;
  // any other length means the input holds more than a single character.
  const expectedLength = codePoint > 0xffff ? 2 : 1;
  if (char.length !== expectedLength) return false;

  // Basic ASCII identifier characters
  if (
    (codePoint >= 0x41 && codePoint <= 0x5a) || // A-Z
    (codePoint >= 0x61 && codePoint <= 0x7a) || // a-z
    (codePoint >= 0x30 && codePoint <= 0x39) || // 0-9
    codePoint === 0x5f || // _
    codePoint === 0x24 // $
  ) {
    return true;
  }

  // Mathematical symbol blocks (extended)
  if (
    (codePoint >= 0x2200 && codePoint <= 0x22ff) || // Mathematical Operators
    (codePoint >= 0x27c0 && codePoint <= 0x27ef) || // Miscellaneous Mathematical Symbols-A
    (codePoint >= 0x2980 && codePoint <= 0x29ff) || // Miscellaneous Mathematical Symbols-B
    (codePoint >= 0x2a00 && codePoint <= 0x2aff) // Supplemental Mathematical Operators
  ) {
    return true;
  }

  // Everything else is decided by Unicode general category.
  return IDENTIFIER_CONTINUE_PATTERN.test(char);
}
/**
 * Validates that a string is a valid identifier.
 *
 * The input is NFC-normalized first. The first character must satisfy
 * `isIdentifierStart` and every following character must satisfy
 * `isIdentifierContinue`.
 *
 * The normalized text is walked by Unicode code point rather than by UTF-16
 * code unit, so a surrogate pair reaches the predicates as one string instead
 * of two lone surrogates.
 *
 * @param identifier - The identifier string to validate
 * @returns True if the string is a valid identifier; false for an empty string
 */
export function isValidIdentifier(identifier: string): boolean {
  if (!identifier) return false;

  // Array.from uses the string iterator, which yields whole code points.
  const [first, ...rest] = Array.from(normalizeUnicode(identifier));

  // `first` is only undefined for an empty sequence; the explicit check keeps
  // the code type-safe without a non-null assertion.
  if (first === undefined || !isIdentifierStart(first)) return false;

  return rest.every((char) => isIdentifierContinue(char));
}
/**
 * Replaces every `\uXXXX` escape (a backslash, `u`, then exactly four
 * hexadecimal digits) with the UTF-16 code unit it denotes.
 * Text that does not match this pattern is returned unchanged.
 *
 * @param text - The text that may contain `\uXXXX` escapes
 * @returns The text with each escape replaced by its character
 */
export function processUnicodeEscapes(text: string): string {
  const escapePattern = /\\u([0-9A-Fa-f]{4})/g;

  const decode = (_match: string, hexDigits: string): string => {
    const unit = Number.parseInt(hexDigits, 16);
    return String.fromCharCode(unit);
  };

  return text.replace(escapePattern, decode);
}
/**
 * Optionally rewrites non-ASCII text as `\uXXXX` escapes.
 *
 * When `escapeNonAscii` is false (the default) the text is returned as is.
 * Otherwise every UTF-16 code unit in the range U+0080..U+FFFF is replaced
 * by `\u` followed by its four-digit upper-case hexadecimal value; ASCII
 * characters are left untouched.
 *
 * @param text - The text to escape
 * @param escapeNonAscii - Whether to escape all non-ASCII characters
 * @returns The escaped text
 */
export function escapeUnicode(text: string, escapeNonAscii = false): string {
  if (!escapeNonAscii) {
    return text;
  }

  const toEscape = (unit: string): string => {
    const hex = unit.charCodeAt(0).toString(16).toUpperCase();
    return "\\u" + hex.padStart(4, "0");
  };

  return text.replace(/[\u0080-\uFFFF]/g, toEscape);
}
/**
 * Extended mathematical symbols commonly used in Scala functional programming.
 *
 * Each entry maps a descriptive name to a single-character string; the
 * trailing comment gives the character's Unicode code point. Declared
 * `as const` so every value keeps its literal type and is read-only.
 */
export const MATHEMATICAL_SYMBOLS = {
  // Greek letters commonly used in functional programming
  ALPHA: "α", // U+03B1
  BETA: "β", // U+03B2
  GAMMA: "γ", // U+03B3
  DELTA: "δ", // U+03B4
  LAMBDA: "λ", // U+03BB
  MU: "μ", // U+03BC
  PI: "π", // U+03C0
  SIGMA: "σ", // U+03C3
  TAU: "τ", // U+03C4
  PHI: "φ", // U+03C6
  // Mathematical operators
  FORALL: "∀", // U+2200
  EXISTS: "∃", // U+2203
  ELEMENT_OF: "∈", // U+2208
  NOT_ELEMENT_OF: "∉", // U+2209
  SUBSET: "⊂", // U+2282
  SUPERSET: "⊃", // U+2283
  UNION: "∪", // U+222A
  INTERSECTION: "∩", // U+2229
  // Arrows and other symbols
  RIGHTWARDS_ARROW: "→", // U+2192
  LEFTWARDS_ARROW: "←", // U+2190
  UP_ARROW: "↑", // U+2191
  DOWN_ARROW: "↓", // U+2193
} as const;