We had a web application that needed to send data to a legacy system that could only process 7-bit ASCII, i.e. the 128 characters of the ASCII character set.
The solution had to “translate” as many characters as possible into their ASCII equivalents, but leave everything that could not be translated alone.
Normally I would do something like this:
<?php
// Transliterate UTF-8 text to ASCII when the iconv extension is available.
// Expects $text to already hold the UTF-8 input string.
if (function_exists('iconv')) {
    $converted = iconv('utf-8', 'us-ascii//TRANSLIT', $text);
    // iconv() returns false on failure; keep the original text in that case
    // instead of overwriting it with false.
    if ($converted !== false) {
        $text = $converted;
    }
}
?>
... but this replaces every character that cannot be transliterated with a question mark (?).
So, we ended up doing the following. Check the end of this function for a (commented-out) PHP regex that simply strips out non-ASCII characters.
<?php
/**
 * Replace as many non-ASCII characters as possible with ASCII look-alikes
 * or equivalents, leaving characters without a known mapping untouched.
 *
 * Processing order:
 *   1. strip_tags()          - remove real markup such as <p> or <br/>.
 *   2. html_entity_decode()  - turn entities (&eacute;, &#8217;, &amp; ...)
 *                              into UTF-8 characters.
 *   3. strtr()               - single-pass transliteration with the map below.
 *
 * Tags are stripped BEFORE transliteration on purpose: several mappings
 * produce "<" or ">" (e.g. "≤" => "<=", "‹" => "<"), and stripping tags
 * afterwards would delete ordinary text between them. Entities are decoded
 * BEFORE transliteration so that "&eacute;" ends up as "e" rather than as a
 * non-ASCII "é". Entity-escaped markup ("&lt;b&gt;") is kept as literal
 * text.
 *
 * The replacement works on bytes via strtr(), so input that is not valid
 * UTF-8 is passed through instead of being discarded.
 *
 * NOTE(review): the listing this was derived from also contained mappings
 * to "M", "T", "IO", "3", "bl" and "4" whose search characters were empty
 * as received. They could not be recovered and are omitted here.
 *
 * @param string $orig_text UTF-8 text, possibly containing HTML tags/entities.
 *
 * @return string The text with all known characters mapped to ASCII.
 */
public function cleanNonAsciiCharactersInString($orig_text)
{
    // Built once per request; maps one UTF-8 character => ASCII replacement.
    static $map = null;

    if ($map === null) {
        // ASCII replacement => every character that should turn into it.
        $groups = array(
            // Single letters
            'a' => '∂άαáàâãªä',
            'A' => '∆ΛÁÀÂÃÄ',
            'b' => 'Ђ',
            'B' => 'β',
            'c' => 'çς©',
            'C' => 'Ç',
            'd' => 'δ',
            'e' => 'éèêëέε℮є',
            'E' => 'ÉÈÊË€ξЄ∑',
            'f' => 'ƒ',
            'F' => '₣',
            'H' => 'Њњ',
            'h' => 'ђћЋ',
            'I' => 'ÍÌÎÏ',
            'i' => 'íìîïιίϊі',
            'j' => 'Јј',
            'K' => 'ΚЌ',
            'k' => 'ќ',
            'l' => 'ℓ∟',
            'n' => 'ñηήπⁿ',
            'N' => 'Ñ∏Ν',
            'o' => 'óòôõºöοσό',
            'O' => 'ÓÒÔÕÖθΩ',
            'p' => 'ρφ',
            'R' => '®',
            'r' => 'Ѓѓ',
            'S' => 'ЅŠ',
            's' => 'ѕš',
            't' => 'τ†‡',
            'u' => 'úùûüџμΰµυϋύ',
            'v' => '√',
            'U' => 'ÚÙÛÜЏ',
            'w' => 'Ψψωώẅẃẁ',
            'W' => 'ẀẄẂ',
            'x' => 'Χχ',
            'Y' => 'ỲΫ¥Ÿ',
            'y' => 'ỳγўЎ',
            'Z' => 'ζ',

            // Punctuation
            ',' => '‚',
            "'" => '`‛′‘’ˆ',
            '"' => '″“”«»„',
            '-' => '—–―−‾⌐─↔→←˜',
            // Presumably the original class held a non-breaking space
            // (U+00A0); written as bytes so it cannot be silently normalised.
            ' ' => "\xC2\xA0",
            '...' => '…',
            '!=' => '≠',
            '<=' => '≤',
            '>=' => '≥',
            '=' => '‗≈≡',

            // Symbols and multi-character replacements
            'c/o' => '℅',
            'Pts' => '₧',
            'tm' => '™',
            'No' => '№',
            '%' => '‰',
            '*' => '∙•',
            '<' => '‹',
            '>' => '›',
            '!!' => '‼',
            '/' => '⁄∕',
            '7/8' => '⅞',
            '5/8' => '⅝',
            '3/8' => '⅜',
            '1/8' => '⅛',
            'Ab' => 'Љљ',
            'OE' => 'Œ',
            'oe' => 'œ',
            // Ligatures U+FB01 / U+FB02, written as bytes: a normalised copy
            // of this file would otherwise turn them into plain "fi"/"fl"
            // and make the mapping rewrite ordinary ASCII letters.
            'fi' => "\xEF\xAC\x81",
            'fl' => "\xEF\xAC\x82",
            '(pounds)' => '£',
            '(lira)' => '₤',
            '|' => '↨↕↓↑│',

            // Characters that are dropped entirely
            '' => '∞∩∫⌂⌠⌡',
        );

        $map = array();
        foreach ($groups as $ascii => $chars) {
            // Split the group into individual UTF-8 characters.
            foreach (preg_split('//u', $chars, -1, PREG_SPLIT_NO_EMPTY) as $char) {
                // First mapping wins if a character is listed twice.
                if (!isset($map[$char])) {
                    $map[$char] = (string) $ascii;
                }
            }
        }
    }

    $text = (string) $orig_text;

    // 1) Remove <p>, <br/> ... while "<" and ">" can only come from markup.
    $text = strip_tags($text);

    // 2) &amp; => &, &quot; => ", &eacute; => é ... (explicit flags and
    //    charset so the result does not depend on the PHP version defaults).
    $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML401, 'UTF-8');

    // 3) Transliterate everything we know; unknown characters stay as-is.
    $text = strtr($text, $map);

    // Alternative: transliterate with iconv (turns unknown characters into "?"):
    // if (function_exists('iconv')) {
    //     $text = iconv('utf-8', 'us-ascii//TRANSLIT', $text);
    // }
    // Alternative: simply remove control and non-ASCII bytes:
    // $text = preg_replace('/[\x00-\x1F\x80-\xFF]/', '', $text);

    return $text;
}
?>