import unidecode from 'unidecode';

/**
 * Range table for the Unicode Mathematical Alphanumeric Symbols block
 * (U+1D400–U+1D7FF). Each entry is `[firstCodepoint, lastCodepoint, asciiBase]`:
 * a codepoint inside the range maps to `asciiBase + (codepoint - firstCodepoint)`.
 *
 * unidecode v1.x has no entries for this block, so the table is derived from
 * the block's structure instead: every letter style occupies 52 consecutive
 * codepoints (A–Z immediately followed by a–z) and every digit style occupies
 * 10 (0–9).
 */
const MATH_ALPHA_STYLES: [number, number, number][] = (() => {
  // Codepoint of the styled capital "A" for each letter style, in block order.
  const letterStyleStarts = [
    0x1d400, // Bold
    0x1d434, // Italic
    0x1d468, // Bold Italic
    0x1d49c, // Script
    0x1d4d0, // Bold Script
    0x1d504, // Fraktur
    0x1d538, // Double-struck
    0x1d56c, // Bold Fraktur
    0x1d5a0, // Sans-serif
    0x1d5d4, // Sans-serif Bold
    0x1d608, // Sans-serif Italic
    0x1d63c, // Sans-serif Bold Italic
    0x1d670, // Monospace
  ];

  // Codepoint of the styled digit "0" for each digit style, in block order.
  const digitStyleStarts = [
    0x1d7ce, // Bold
    0x1d7d8, // Double-struck
    0x1d7e2, // Sans-serif
    0x1d7ec, // Sans-serif Bold
    0x1d7f6, // Monospace
  ];

  const runs: [number, number, number][] = [];
  for (const capitalA of letterStyleStarts) {
    runs.push([capitalA, capitalA + 25, 65]); // A–Z → ASCII 'A' (65) onwards
    runs.push([capitalA + 26, capitalA + 51, 97]); // a–z → ASCII 'a' (97) onwards
  }
  for (const zero of digitStyleStarts) {
    runs.push([zero, zero + 9, 48]); // 0–9 → ASCII '0' (48) onwards
  }
  return runs;
})();

/**
 * Converts a styled math letter/digit codepoint to its plain ASCII character.
 *
 * @param cp Unicode codepoint to look up.
 * @returns The ASCII character at the same offset within its style run, or
 *          `null` when `cp` is not covered by any run in MATH_ALPHA_STYLES.
 */
const mathAlphanumericToAscii = (cp: number): string | null => {
  const run = MATH_ALPHA_STYLES.find(([first, last]) => first <= cp && cp <= last);
  if (run === undefined) return null;

  const [first, , asciiBase] = run;
  // The offset inside the styled run is the offset inside A–Z / a–z / 0–9.
  return String.fromCharCode(asciiBase + (cp - first));
};

/**
 * Explicit codepoint → ASCII table for styled single letters in the
 * Letterlike Symbols block (U+2100–U+214F), as used in math/fancy text.
 * e.g. ℍ → H, ℝ → R, ℤ → Z
 */
const LETTERLIKE_MAP: Record<number, string> = {
  0x2102: 'C', // ℂ double-struck capital C
  0x210a: 'g', // ℊ script small g
  0x210b: 'H', // ℋ script capital H
  0x210c: 'H', // ℌ black-letter capital H
  0x210d: 'H', // ℍ double-struck capital H
  0x210e: 'h', // ℎ Planck constant
  0x210f: 'h', // ℏ Planck constant over two pi
  0x2110: 'I', // ℐ script capital I
  0x2111: 'I', // ℑ black-letter capital I
  0x2112: 'L', // ℒ script capital L
  0x2113: 'l', // ℓ script small l
  0x2115: 'N', // ℕ double-struck capital N
  0x2119: 'P', // ℙ double-struck capital P
  0x211a: 'Q', // ℚ double-struck capital Q
  0x211b: 'R', // ℛ script capital R
  0x211c: 'R', // ℜ black-letter capital R
  0x211d: 'R', // ℝ double-struck capital R
  0x2124: 'Z', // ℤ double-struck capital Z
  0x2128: 'Z', // ℨ black-letter capital Z
  0x212c: 'B', // ℬ script capital B
  0x212d: 'C', // ℭ black-letter capital C
  0x212f: 'e', // ℯ script small e
  0x2130: 'E', // ℰ script capital E
  0x2131: 'F', // ℱ script capital F
  0x2133: 'M', // ℳ script capital M
  0x2134: 'o', // ℴ script small o
};

/**
 * Explicit codepoint → ASCII table for small-capital and phonetic letters that
 * are used as fancy substitutes in names. Entries are drawn from the Phonetic
 * Extensions, IPA Extensions and Latin Extended-D blocks.
 * e.g. ᴅ → D, ʜ → H, ᴀ → A, ʀ → R, ꜱ → S
 *
 * Values may be longer than one character (ᴁ → "AE").
 */
const SMALLCAPS_MAP: Record<number, string> = {
  // IPA Extensions
  0x0250: 'a', // ɐ turned a
  0x0251: 'a', // ɑ alpha
  0x0254: 'o', // ɔ open o
  0x025b: 'e', // ɛ open e
  0x0261: 'g', // ɡ script g
  0x0262: 'G', // ɢ small capital G
  0x0264: 'o', // ɤ rams horn
  0x0265: 'h', // ɥ turned h
  0x026a: 'I', // ɪ small capital I
  0x0274: 'N', // ɴ small capital N
  0x0279: 'r', // ɹ turned r
  0x027e: 'r', // ɾ r with fishhook
  0x0280: 'R', // ʀ small capital R
  0x028b: 'v', // ʋ v with hook
  0x028c: 'v', // ʌ turned v
  0x028f: 'Y', // ʏ small capital Y
  0x0299: 'B', // ʙ small capital B
  0x029c: 'H', // ʜ small capital H
  0x029f: 'L', // ʟ small capital L

  // Phonetic Extensions
  0x1d00: 'A', // ᴀ small capital A
  0x1d01: 'AE', // ᴁ small capital AE
  0x1d03: 'B', // ᴃ small capital barred B
  0x1d04: 'C', // ᴄ small capital C
  0x1d05: 'D', // ᴅ small capital D
  0x1d06: 'D', // ᴆ small capital eth
  0x1d07: 'E', // ᴇ small capital E
  0x1d08: 'e', // ᴈ turned open e
  0x1d09: 'i', // ᴉ turned i
  0x1d0a: 'J', // ᴊ small capital J
  0x1d0b: 'K', // ᴋ small capital K
  0x1d0c: 'L', // ᴌ small capital L with stroke
  0x1d0d: 'M', // ᴍ small capital M
  0x1d0e: 'N', // ᴎ small capital reversed N
  0x1d0f: 'O', // ᴏ small capital O
  0x1d10: 'O', // ᴐ small capital open O
  0x1d18: 'P', // ᴘ small capital P
  0x1d19: 'R', // ᴙ small capital reversed R
  0x1d1a: 'R', // ᴚ small capital turned R
  0x1d1b: 'T', // ᴛ small capital T
  0x1d1c: 'U', // ᴜ small capital U
  0x1d20: 'V', // ᴠ small capital V
  0x1d21: 'W', // ᴡ small capital W
  0x1d22: 'Z', // ᴢ small capital Z

  // Latin Extended-D small caps
  0xa730: 'F', // ꜰ small capital F
  0xa731: 'S', // ꜱ small capital S
  0xa7ae: 'I', // Ɪ capital form of small capital I
};

/**
 * Returns true for codepoints that belong to a native script we preserve as-is.
 *
 * Covers the major Indic script blocks (each block is accepted in full) and
 * Arabic-script letters: the basic Arabic letters U+0620–U+064A plus the
 * extended letters U+0671–U+06D3 that Urdu, Persian, Pashto, Sindhi and
 * similar languages rely on (e.g. پ چ گ ک ی ے). Without the extended range
 * those letters would not be preserved, leaving names in these languages
 * partly Latinized or with letters missing.
 *
 * Arabic punctuation, digits and combining marks (e.g. ٭ U+066D, ۔ U+06D4,
 * U+064B–U+065F) are deliberately outside both Arabic ranges.
 *
 * CJK / Hangul / Kana are intentionally excluded — they go through the unidecode
 * fallback so decorative uses (e.g. ▄︻デ gun symbols) are stripped while still
 * allowing the fallback to handle genuine CJK names gracefully.
 *
 * @param cp Unicode codepoint to classify.
 * @returns `true` when the codepoint should be kept unchanged.
 */
const isNativeScriptLetter = (cp: number): boolean =>
  (cp >= 0x0900 && cp <= 0x097f) || // Devanagari (Hindi)
  (cp >= 0x0980 && cp <= 0x09ff) || // Bengali
  (cp >= 0x0a00 && cp <= 0x0a7f) || // Gurmukhi (Punjabi)
  (cp >= 0x0a80 && cp <= 0x0aff) || // Gujarati
  (cp >= 0x0b00 && cp <= 0x0b7f) || // Odia
  (cp >= 0x0b80 && cp <= 0x0bff) || // Tamil
  (cp >= 0x0c00 && cp <= 0x0c7f) || // Telugu
  (cp >= 0x0c80 && cp <= 0x0cff) || // Kannada
  (cp >= 0x0d00 && cp <= 0x0d7f) || // Malayalam
  (cp >= 0x0620 && cp <= 0x064a) || // Arabic letters (excludes punctuation like ٭ U+066D)
  (cp >= 0x0671 && cp <= 0x06d3);   // Extended Arabic letters (Urdu, Persian, Pashto, Sindhi…)

/**
 * Sanitizes a text string for use in lead/contact name fields.
 *
 * Falsy or non-string input is returned unchanged.
 *
 * Processing order per character (the string is iterated by code point):
 * 1. Math Alphanumeric Symbols (𝓐–𝓩, 𝕒–𝕫, 𝗵, 𝘔, etc.) → ASCII via structural map
 * 2. Plain ASCII printable (space–~) → kept as-is
 * 3. Any other whitespace (tab, newline, no-break space, ideographic space, …)
 *    → one ASCII space, so the words on either side are not fused together
 * 4. Native script letters (Hindi, Arabic, Indic scripts) → kept as-is
 * 5. Letterlike Symbols (ℍ, ℝ, ℤ…) + Small Caps / Phonetic (ᴅ, ʜ, ᴀ, ʀ, ꜱ…) → explicit map
 * 6. Everything else → unidecode transliteration; accepted only when the result is
 *    exactly 1 alphanumeric character (prevents symbol expansions like ° → "deg",
 *    ё → "io", デ → "de" from leaking through)
 *
 * Collapses resulting extra whitespace and trims.
 *
 * @param text Raw field value.
 * @returns The sanitized string, or `text` itself when it is falsy / not a string.
 */
export const smartSanitize = (text: string): string => {
  if (!text || typeof text !== 'string') return text;
  return [...text]
    .map((char) => {
      const cp = char.codePointAt(0) ?? 0;

      const math = mathAlphanumericToAscii(cp);
      if (math !== null) return math;

      if (cp >= 0x0020 && cp <= 0x007e) return char;

      // Non-ASCII-space whitespace would otherwise reach the fallback below,
      // where the alphanumeric filter deletes it ("John\nSmith" → "JohnSmith").
      // A one-character string trims to '' exactly when it is whitespace.
      // U+FEFF (BOM / zero-width no-break space) also counts as whitespace for
      // trim(), but it is invisible, so it is left to the fallback and removed.
      if (cp !== 0xfeff && char.trim() === '') return ' ';

      if (isNativeScriptLetter(cp)) return char;

      const extra = LETTERLIKE_MAP[cp] ?? SMALLCAPS_MAP[cp] ?? null;
      if (extra !== null) return extra;

      // Keep a transliteration only when it is a single ASCII letter or digit.
      const t = unidecode(char).replace(/[^a-zA-Z0-9]/g, '');
      return t.length === 1 ? t : '';
    })
    .join('')
    .replace(/\s+/g, ' ')
    .trim();
};

/** Keys of a lead import row whose string values are run through smartSanitize. */
const TEXT_FIELDS: string[] = [
  'firstName', 'lastName',
  'email', 'secondaryEmail',
  'notes', 'remarks',
];

/**
 * Returns a shallow copy of a lead import row in which every field listed in
 * TEXT_FIELDS that currently holds a string has been passed through
 * smartSanitize. Non-string values and all other keys are copied untouched;
 * the input row is not mutated.
 *
 * @param row Lead import row keyed by field name.
 * @returns A new object with the sanitized text fields.
 */
export const sanitizeLeadRow = (row: Record<string, any>): Record<string, any> => {
  const cleaned: Record<string, any> = { ...row };
  TEXT_FIELDS.forEach((key) => {
    const value = cleaned[key];
    if (typeof value === 'string') {
      cleaned[key] = smartSanitize(value);
    }
  });
  return cleaned;
};
