Branch: Tag:

2006-03-02

2006-03-02 18:35:42 by Henrik Grubbström (Grubba) <grubba@grubba.org>

Improved character set detection and handling.

Rev: lib/modules/Standards.pmod/IIM.pmod:1.2

3:   //   // http://www.iptc.org/IIM/   // - // $Id: IIM.pmod,v 1.1 2006/03/01 16:37:27 grubba Exp $ + // $Id: IIM.pmod,v 1.2 2006/03/02 18:35:42 grubba Exp $   //   // Anders Johansson & Henrik Grubbström   
83:    200: "objectdata preview file format",    201: "objectdata preview file format version",    202: "objectdata preview data", +  +  // This one seems to contain a charset name... +  // eg "CP_1252" or "CP_2" +  183: "charset",    ]),    3: ([ ]), // DIGITAL NEWSPHOTO PARAMETER    4: ([ ]), // Not Allocated,
102:    ])    ]);    - // Most application record fields seem to be encoded - // with the macintosh charset. - // - // This has been verified for the fields: - // "by-line" - // "caption/abstract" - // "city" - // "copyright notice" - // "headline" - // "keywords" - // "object name" - // "source" - // "special instructions" - // "supplemental category" - // "writer/editor" - // and is assumed for the remainder. - mapping(int:string) encodings = ([ -  // IIMV 4.1 Chapter 3 Section 1.6 (a): -  // Record 1:xx shall use coded character set ISO 646 International -  // Reference Version or ISO 4873 Default Version. -  // -  // IIMV 4.1 Chapter 5 1:90: -  // The control functions apply to character oriented DataSets in -  // records 2-6. They also apply to record 8, unless the objectdata -  // explicitly, or the File Format implicitly, defines character sets -  // otherwise. -  // [...] -  // If 1:90 is omitted, the default for records 2-6 and 8 is ISO 646 -  // IRV (7 bits) or ISO 4873 DV (8 bits). Record 1 shall always use -  // ISO 646 IRV or ISO 4873 DV respectively. -  // -  // In practice the above of course isn't true, and it seems -  // that macintosh encoding is used in place of ISO 4873 DV. -  // -  // 1: "iso646irv" or "iso4873dv", -  2: "macintosh", + mapping(int:multiset(int)) binary_fields = ([ +  1: (<20, 22>), +  2: (<0>),   ]);    -  -  +    static int short_value(string str)   {    return (str[0]<<8)|str[1];
159:    return res;    }    +  // IIMV 4.1 Chapter 3 Section 1.6 (a): +  // Record 1:xx shall use coded character set ISO 646 International +  // Reference Version or ISO 4873 Default Version. +  // +  // IIMV 4.1 Chapter 5 1:90: +  // The control functions apply to character oriented DataSets in +  // records 2-6. They also apply to record 8, unless the objectdata +  // explicitly, or the File Format implicitly, defines character sets +  // otherwise. +  // [...] +  // If 1:90 is omitted, the default for records 2-6 and 8 is ISO 646 +  // IRV (7 bits) or ISO 4873 DV (8 bits). Record 1 shall always use +  // ISO 646 IRV or ISO 4873 DV respectively. +  // +  // In practice the above of course isn't true, and it seems +  // that macintosh encoding is used in place of ISO 4873 DV. +  // +  // 1: "iso646irv" or "iso4873dv", +  // +  // Most application record fields seem to be encoded +  // with the macintosh charset. +  // +  // This has been verified for the fields: +  // "by-line" +  // "caption/abstract" +  // "city" +  // "copyright notice" +  // "headline" +  // "keywords" +  // "object name" +  // "source" +  // "special instructions" +  // "supplemental category" +  // "writer/editor" +  // and is assumed for the remainder. +  // +  // Some do however (eg Nyhedstjeneste in Denmark) use ISO-8859-1. +  // +  // We attempt some DWIM further down. +     do {    string app = fd->read(2);    if (sizeof(app) != 2)
213:    info = info[size+5..];       if (segment_marker != '\x1c') { -  werror("Unknown segment marker: %O\n", segment_marker); +  if (segment_marker == '\x6f') { +  // I have not found any documentation for this segment, +  // but I use it to detect Nyhedstjeneste. +  if ((record_set == 110) && (!id)) { +  res->charset = ({ "iso-8859-1" });    continue; -  //break; +     } -  +  } + #if 1 +  werror("Unknown segment marker: 0x%02x\n" +  "record_set: %d\n" +  "id: %d\n" +  "data: %O\n", segment_marker, record_set, id, data); + #endif /* 1 */ +  break; +  }       if (!has_value(indices(fields), record_set)) {    werror("Unknown record set marker: %O\n", record_set); -  continue; -  //break; +  break;    }       //werror("%3d: ", id); -  //werror("%s\n", data); +  //werror("%O\n", data);    //werror("info: %O\n", String.string2hex(info));    string label =    fields[record_set][id] ||    (string)record_set + ":" + (string)id;    -  if ((record_set == 2) && !id) { -  // The record version is binary encoded. -  data = (string)Gmp.mpz(data, 256); +  if (label == "coded character set") { +  if (data == "\e%5") { +  res->charset = (res->charset || ({})) + ({ "iso-8859-1" });    } -  +  }    -  string encoding; -  if (encoding = encodings[record_set]) { -  object decoder = Locale.Charset.decoder(encoding); -  catch { -  data = decoder->feed(data)->drain(); -  }; +  if ((binary_fields[record_set] && binary_fields[record_set][id]) || +  (<3, 7>)[record_set]) { +  // Decode binary fields. +  data = (string)Gmp.mpz(data, 256);    }    -  +  // werror("RAW: %O:%O\n", label, data); +     if (res[label])    res[label] += ({ data });    else
256:    fd->read(length-2);    } while (1);    +  if (sizeof(res)) { +  string charset; +  if (!res->charset) { +  charset = "macintosh"; +  } else { +  charset = lower_case(res->charset[0]); +  +  // Remap to standard names: +  charset = ([ +  "cp_1252":"windows1252", +  "cp_2":"macintosh", +  ])[charset] || charset; +  } +  //werror("Charset: %O\n", charset); +  res->charset = ({ charset }); +  object decoder = Locale.Charset.decoder(charset); +  foreach(res; string key; array(string) vals) { +  res[key] = map(vals, +  lambda(string val, object decoder) { +  return decoder->feed(val)->drain(); +  }, decoder); +  } +  } +     return res;   }