Removing entities from HTML in Cocoa

To display accented characters and certain symbols in an HTML or XML document you need to encode them. For example, the copyright symbol © is represented in HTML as &amp;copy; (that is, the text `&copy;`).

Applications like NewsMac Pro need to be able to decode these entities and translate them to the appropriate character. Straightforward you might think, but actually it isn’t. There are multiple ways in which characters can be encoded: as before with a textual name, but also with a decimal or hex value. In NewsMac Pro I used to use NSAttributedString’s initWithHTML method; however, for whatever reason this seems to lock up under Tiger, so I had to find an alternative solution. I thought I’d post the following code to help out other developers, because if you go searching on this topic you will most likely get people telling you to use the NSAttributedString method.

This probably isn’t the most elegant bit of code ever, but it serves its purpose:

/**
 * Decodes character entities found in HTML/XML text.
 *
 * Decoding happens in three passes:
 *   1. CFXMLCreateStringByUnescapingEntities() handles the basic XML entities.
 *   2. Named HTML entities (&nbsp; .. &yuml; and a set of higher code points
 *      such as &euro; or &hellip;) are replaced by their characters.
 *   3. Numeric character references, decimal (&#169;) or hexadecimal
 *      (&#xA9; / &#XA9;), are replaced by their characters. Code points above
 *      U+FFFF are emitted as a UTF-16 surrogate pair. Malformed or
 *      out-of-range references are left in the text untouched.
 *
 * This file uses manual reference counting (plain CF <-> NS casts).
 *
 * @param source The text to decode. May be nil.
 * @return nil if source is nil; source itself if it contains no '&';
 *         otherwise an autoreleased mutable string with the entities decoded.
 */
+ (NSString *) decodeCharacterEntitiesIn:(NSString *)source
{
  if(!source) return nil;
  if([source rangeOfString: @"&"].location == NSNotFound) return source;

  // Named entities for U+00A0 .. U+00FF in code point order, so that the
  // entry at index i decodes to the character (160 + i).
  NSArray *codes = [NSArray arrayWithObjects:
    @"&nbsp;", @"&iexcl;", @"&cent;", @"&pound;", @"&curren;", @"&yen;", @"&brvbar;",
    @"&sect;", @"&uml;", @"&copy;", @"&ordf;", @"&laquo;", @"&not;", @"&shy;", @"&reg;",
    @"&macr;", @"&deg;", @"&plusmn;", @"&sup2;", @"&sup3;", @"&acute;", @"&micro;",
    @"&para;", @"&middot;", @"&cedil;", @"&sup1;", @"&ordm;", @"&raquo;", @"&frac14;",
    @"&frac12;", @"&frac34;", @"&iquest;", @"&Agrave;", @"&Aacute;", @"&Acirc;",
    @"&Atilde;", @"&Auml;", @"&Aring;", @"&AElig;", @"&Ccedil;", @"&Egrave;",
    @"&Eacute;", @"&Ecirc;", @"&Euml;", @"&Igrave;", @"&Iacute;", @"&Icirc;", @"&Iuml;",
    @"&ETH;", @"&Ntilde;", @"&Ograve;", @"&Oacute;", @"&Ocirc;", @"&Otilde;", @"&Ouml;",
    @"&times;", @"&Oslash;", @"&Ugrave;", @"&Uacute;", @"&Ucirc;", @"&Uuml;", @"&Yacute;",
    @"&THORN;", @"&szlig;", @"&agrave;", @"&aacute;", @"&acirc;", @"&atilde;", @"&auml;",
    @"&aring;", @"&aelig;", @"&ccedil;", @"&egrave;", @"&eacute;", @"&ecirc;", @"&euml;",
    @"&igrave;", @"&iacute;", @"&icirc;", @"&iuml;", @"&eth;", @"&ntilde;", @"&ograve;",
    @"&oacute;", @"&ocirc;", @"&otilde;", @"&ouml;", @"&divide;", @"&oslash;", @"&ugrave;",
    @"&uacute;", @"&ucirc;", @"&uuml;", @"&yacute;", @"&thorn;", @"&yuml;", nil];

  // Named entities above U+00FF; highCodeNumbers holds the matching code
  // point for each entry, in the same order.
  NSArray *highCodes = [NSArray arrayWithObjects: @"&OElig;",   // 338
                                                  @"&oelig;",   // 339
                                                  @"&Scaron;",  // 352
                                                  @"&scaron;",  // 353
                                                  @"&Yuml;",    // 376
                                                  @"&circ;",    // 710
                                                  @"&tilde;",   // 732
                                                  @"&ndash;",   // 8211
                                                  @"&mdash;",   // 8212
                                                  @"&lsquo;",   // 8216
                                                  @"&rsquo;",   // 8217
                                                  @"&sbquo;",   // 8218
                                                  @"&ldquo;",   // 8220
                                                  @"&rdquo;",   // 8221
                                                  @"&bdquo;",   // 8222
                                                  @"&dagger;",  // 8224
                                                  @"&Dagger;",  // 8225
                                                  @"&hellip;",  // 8230
                                                  @"&permil;",  // 8240
                                                  @"&lsaquo;",  // 8249
                                                  @"&rsaquo;",  // 8250
                                                  @"&euro;",    // 8364
                                                  nil];
  const unichar highCodeNumbers[22] = { 338, 339, 352, 353, 376, 710, 732, 8211, 8212,
                                        8216, 8217, 8218, 8220, 8221, 8222, 8224, 8225,
                                        8230, 8240, 8249, 8250, 8364 }; // 22 entries

  // Pass 1: decode basic XML entities. The CF function follows the Create
  // rule, so the returned string is owned here and must be released.
  NSMutableString *escaped = nil;
  CFStringRef unescaped = CFXMLCreateStringByUnescapingEntities(NULL, (CFStringRef)source, NULL);
  if(unescaped)
  {
    escaped = [NSMutableString stringWithString: (NSString *)unescaped];
    CFRelease(unescaped);
  }
  else
  {
    // Fall back to the raw text rather than passing nil to stringWithString:.
    escaped = [NSMutableString stringWithString: source];
  }

  // Pass 2a: named entities for U+00A0 .. U+00FF. The presence check is made
  // against the original source so text that only became an entity name
  // through pass 1 (e.g. "&amp;copy;") is not decoded a second time.
  unsigned int i, count = (unsigned int)[codes count];
  for(i = 0; i < count; i++)
  {
    NSString *name = [codes objectAtIndex: i];
    if([source rangeOfString: name].location == NSNotFound) continue;

    unichar ch = (unichar)(160 + i);
    [escaped replaceOccurrencesOfString: name
                             withString: [NSString stringWithCharacters: &ch length: 1]
                                options: NSLiteralSearch
                                  range: NSMakeRange(0, [escaped length])];
  }

  // Pass 2b: named entities above U+00FF.
  count = (unsigned int)[highCodes count];
  for(i = 0; i < count; i++)
  {
    NSString *name = [highCodes objectAtIndex: i];
    if([source rangeOfString: name].location == NSNotFound) continue;

    unichar ch = highCodeNumbers[i];
    [escaped replaceOccurrencesOfString: name
                             withString: [NSString stringWithCharacters: &ch length: 1]
                                options: NSLiteralSearch
                                  range: NSMakeRange(0, [escaped length])];
  }

  // Pass 3: numeric character references (decimal and hex).
  // Longest accepted body between "&#" and ";": an optional 'x' plus digits.
  // Eight characters covers every valid code point and keeps the accumulator
  // below from overflowing 32 bits.
  const unsigned int maxBodyLength = 8;
  NSRange searchRange = NSMakeRange(0, [escaped length]);

  while(searchRange.length > 0)
  {
    NSRange start = [escaped rangeOfString: @"&#" options: NSLiteralSearch range: searchRange];
    if(start.location == NSNotFound) break;

    // Look for the terminating ';' only AFTER the "&#" so that an unrelated
    // earlier semicolon cannot stop the scan.
    NSRange body = NSMakeRange(start.location + 2, [escaped length] - (start.location + 2));
    NSRange finish = [escaped rangeOfString: @";" options: NSLiteralSearch range: body];

    unsigned int codePoint = 0;
    BOOL valid = NO;

    if(finish.location != NSNotFound
       && finish.location > body.location
       && finish.location - body.location <= maxBodyLength)
    {
      NSString *digits = [escaped substringWithRange:
                            NSMakeRange(body.location, finish.location - body.location)];
      unsigned int length = (unsigned int)[digits length];
      unsigned int base = 10, k = 0;
      unichar first = [digits characterAtIndex: 0];

      if(first == 'x' || first == 'X')
      {
        base = 16;
        k = 1;
      }

      valid = (k < length); // at least one digit is required
      for(; valid && k < length; k++)
      {
        unichar c = [digits characterAtIndex: k];
        unsigned int digit = 0;

        if(c >= '0' && c <= '9')                    digit = c - '0';
        else if(base == 16 && c >= 'a' && c <= 'f') digit = 10 + (c - 'a');
        else if(base == 16 && c >= 'A' && c <= 'F') digit = 10 + (c - 'A');
        else                                        valid = NO;

        if(valid) codePoint = codePoint * base + digit;
      }

      // Reject NUL, lone surrogates and anything beyond the Unicode range.
      if(codePoint == 0 || codePoint > 0x10FFFF
         || (codePoint >= 0xD800 && codePoint <= 0xDFFF))
      {
        valid = NO;
      }
    }

    if(valid)
    {
      unichar units[2];
      unsigned int unitCount = 1;

      if(codePoint > 0xFFFF)
      {
        // Outside the BMP: encode as a UTF-16 surrogate pair.
        unsigned int offset = codePoint - 0x10000;
        units[0] = (unichar)(0xD800 + (offset >> 10));
        units[1] = (unichar)(0xDC00 + (offset & 0x3FF));
        unitCount = 2;
      }
      else
      {
        units[0] = (unichar)codePoint;
      }

      NSRange entityRange = NSMakeRange(start.location, (finish.location - start.location) + 1);
      [escaped replaceCharactersInRange: entityRange
                             withString: [NSString stringWithCharacters: units length: unitCount]];

      // Resume after the inserted character(s) so decoded output is never
      // re-interpreted as the start of another reference.
      searchRange.location = start.location + unitCount;
    }
    else
    {
      // Not a usable reference: leave the text alone and continue after "&#".
      searchRange.location = body.location;
    }
    searchRange.length = [escaped length] - searchRange.location;
  }

  return escaped;
}

2 Comments on “Removing entities from HTML in Cocoa”

  1. The codes array currently does not map properly because it has as its first 4 entries @"&amp;", @"&lt;", @"&gt;" and @"&quot;", which are not the equivalent of 160, 161, 162, and 163. This also causes the other entries to be off by 4.

    • I’d fixed that bug long ago but forgotten to update the blog post. I’ve updated the code to use the built-in CFXMLCreateStringByUnescapingEntities() function to decode the basic entities before tackling everything else. It’s too bad this function doesn’t do all the work!

Leave a Reply

Your email address will not be published. Required fields are marked *