I recently needed a way to replace accented characters with simple English ones to allow more readable, friendly URLs. I'm sure there are plenty of Danes out there who are sick of seeing their language butchered by UrlEncode. After a bit of reading up, it seems .NET 2.0 does 99% of the heavy lifting for you:
1: //using System.Text;
2:
/// <summary>
/// Replaces accented characters with their closest unaccented equivalents.
/// </summary>
/// <param name="original">The string to simplify; may be null or empty.</param>
/// <returns>
/// The input with non-spacing (combining) marks removed and every character
/// found in <c>Lookup</c> (for example 'æ' becomes "ae") replaced by its
/// mapped text, or <see cref="string.Empty"/> when
/// <paramref name="original"/> is null or empty.
/// </returns>
/// <remarks>Based on code from: http://blogs.msdn.com/b/michkap/archive/2007/05/14/2629747.aspx</remarks>
public static string ToSimpleCharacters(this string original)
{
    if (string.IsNullOrEmpty(original))
    {
        return string.Empty;
    }

    // Form D splits a precomposed letter into its base letter followed by
    // combining marks, so the marks can be dropped individually below.
    string decomposed = original.Normalize(NormalizationForm.FormD);
    StringBuilder simplified = new StringBuilder(decomposed.Length);

    // Read the table once instead of going through the property per character.
    Dictionary<char, string> substitutions = Lookup;

    foreach (char current in decomposed)
    {
        if (CharUnicodeInfo.GetUnicodeCategory(current) == UnicodeCategory.NonSpacingMark)
        {
            // This is an accent that was separated from its base letter: skip it.
            continue;
        }

        // Single dictionary probe (TryGetValue) rather than ContainsKey + indexer.
        string replacement;
        if (substitutions.TryGetValue(current, out replacement))
        {
            sb_append:
            simplified.Append(replacement);
        }
        else
        {
            simplified.Append(current);
        }
    }

    // Recompose anything that is still decomposed before handing the text back.
    return simplified.ToString().Normalize(NormalizationForm.FormC);
}
33:
/// <summary>
/// Backing table for <see cref="Lookup"/>. It is built once by the static
/// field initializer and never modified afterwards, so callers never observe
/// a partially filled dictionary (the previous lazy getter published the
/// instance before adding its entries).
/// </summary>
private static readonly Dictionary<char, string> _lookup = new Dictionary<char, string>
{
    // These letters have no canonical decomposition, so stripping combining
    // marks after Form D normalization leaves them untouched; they need an
    // explicit replacement.
    { '\u00E6', "ae" }, // æ
    { '\u00C6', "Ae" }, // Æ
    { '\u00F0', "d" },  // ð (eth)
    { '\u00D0', "D" },  // Ð (capital eth, to match the lowercase entry)
    { '\u00F8', "oe" }, // ø
    { '\u00D8', "Oe" }, // Ø
    { '\u00DF', "ss" }  // ß
};

/// <summary>
/// Gets the map from characters that normalization cannot simplify to their
/// plain-text replacements.
/// </summary>
private static Dictionary<char, string> Lookup
{
    get { return _lookup; }
}
I’m sure that there must be a few substitutions that don’t get caught by this code. If you’ve got one just drop me a line!
Updated code to substitute the ETH character with a 'd' rather than an 'o', on the advice of JKos over on Code Project.
ReplyDelete