Branch: Tag:

2006-09-11

2006-09-11 14:43:51 by Henrik Grubbström (Grubba) <grubba@grubba.org>

Added support for extraction from PostScript (i.e. EPS) files.

Rev: lib/modules/Standards.pmod/IIM.pmod:1.3

3:   //   // http://www.iptc.org/IIM/   // - // $Id: IIM.pmod,v 1.2 2006/03/02 18:35:42 grubba Exp $ + // $Id: IIM.pmod,v 1.3 2006/09/11 14:43:51 grubba Exp $   //   // Anders Johansson & Henrik Grubbström   
117:    //return (str[1]<<8)|str[0];   }    -  - mapping get_information(Stdio.File fd) + static mapping(string:string|array(string)) decode_photoshop_data(string data)   { -  mapping res = ([]); +  mapping(string:string|array(string)) res = ([]);    -  string jpeg_marker = fd->read(2); -  if (jpeg_marker != "\xff\xd8") { -  //werror("unknown JPEG marker: %O\n", jpeg_marker); -  return res; -  } -  -  // IIMV 4.1 Chapter 3 Section 1.6 (a): -  // Record 1:xx shall use coded character set ISO 646 International -  // Reference Version or ISO 4873 Default Version. -  // -  // IIMV 4.1 Chapter 5 1:90: -  // The control functions apply to character oriented DataSets in -  // records 2-6. They also apply to record 8, unless the objectdata -  // explicitly, or the File Format implicitly, defines character sets -  // otherwise. -  // [...] -  // If 1:90 is omitted, the default for records 2-6 and 8 is ISO 646 -  // IRV (7 bits) or ISO 4873 DV (8 bits). Record 1 shall always use -  // ISO 646 IRV or ISO 4873 DV respectively. -  // -  // In practice the above of course isn't true, and it seems -  // that macintosh encoding is used in place of ISO 4873 DV. -  // -  // 1: "iso646irv" or "iso4873dv", -  // -  // Most application record fields seem to be encoded -  // with the macintosh charset. -  // -  // This has been verified for the fields: -  // "by-line" -  // "caption/abstract" -  // "city" -  // "copyright notice" -  // "headline" -  // "keywords" -  // "object name" -  // "source" -  // "special instructions" -  // "supplemental category" -  // "writer/editor" -  // and is assumed for the remainder. -  // -  // Some do however (eg Nyhedstjeneste in Denmark) use ISO-8859-1. -  // -  // We attempt some DWIM further down. 
-  -  do { -  string app = fd->read(2); -  if (sizeof(app) != 2) -  break; -  string length_s = fd->read(2); -  int length; -  if (sizeof(length_s) == 2) -  length = short_value(length_s); -  //werror ("length: %O\n", short_value(length_s)); -  -  if (app == "\xff\xed") // APP14 Photoshop -  { -  string data = fd->read(length-2); -  //werror("data: %O\n", data); -  +     // 0x0404 is IPTC IIM    array blocks = (data / "8BIM\4\4")[1..];    if (!sizeof(blocks)) {    werror("No 8BIM/IPTC IIM markers found in data\n"); -  break; +  return res;    }    //werror("blocks: %O\n", blocks);    foreach(blocks, string block) {
271:    res[label] = ({ data });    }    } +  return res; + } +  + mapping get_information(Stdio.File fd) + { +  string marker = fd->read(2); +  string photoshop_data = ""; +  +  if (marker == "%!") { +  int bytes = -1; +  // Note: We use the split iterator by hand to make sure '\r' is +  // valid as a line terminator. +  foreach(String.SplitIterator(marker, (<'\r','\n'>), 1, +  fd->read_function(8192)); +  int lineno; string line) { +  if (line[0] != '%') continue; +  if (bytes < 0) sscanf(line, "%%BeginPhotoshop: %d", bytes); +  else if (has_prefix(line, "% ")) { +  photoshop_data += String.hex2string(line[2..]); +  if (sizeof(photoshop_data) >= bytes) break; +  } +  else if (has_prefix(line, "%EndPhotoshop")) {    break;    } -  +  } +  } else if (marker == "\xff\xd8") { +  do { +  string app = fd->read(2); +  if (sizeof(app) != 2) +  break; +  string length_s = fd->read(2); +  int length; +  if (sizeof(length_s) == 2) +  length = short_value(length_s); +  //werror ("length: %O\n", short_value(length_s));    -  fd->read(length-2); +  string data = fd->read(length-2); +  if (app == "\xff\xed") // APP14 Photoshop +  { +  //werror("data: %O\n", data); +  photoshop_data = data; +  break; +  }    } while (1); -  +  } else { +  //werror("unknown marker: %O neither JPEG nor Postscript\n", marker); +  return ([]); +  }    -  +  if (!sizeof(photoshop_data)) return ([]); +  +  mapping res = decode_photoshop_data(photoshop_data); +     if (sizeof(res)) { -  +  // IIMV 4.1 Chapter 3 Section 1.6 (a): +  // Record 1:xx shall use coded character set ISO 646 International +  // Reference Version or ISO 4873 Default Version. +  // +  // IIMV 4.1 Chapter 5 1:90: +  // The control functions apply to character oriented DataSets in +  // records 2-6. They also apply to record 8, unless the objectdata +  // explicitly, or the File Format implicitly, defines character sets +  // otherwise. +  // [...] 
+  // If 1:90 is omitted, the default for records 2-6 and 8 is ISO 646 +  // IRV (7 bits) or ISO 4873 DV (8 bits). Record 1 shall always use +  // ISO 646 IRV or ISO 4873 DV respectively. +  // +  // In practice the above of course isn't true, and it seems +  // that macintosh encoding is used in place of ISO 4873 DV. +  // +  // 1: "iso646irv" or "iso4873dv", +  // +  // Most application record fields seem to be encoded +  // with the macintosh charset. +  // +  // This has been verified for the fields: +  // "by-line" +  // "caption/abstract" +  // "city" +  // "copyright notice" +  // "headline" +  // "keywords" +  // "object name" +  // "source" +  // "special instructions" +  // "supplemental category" +  // "writer/editor" +  // and is assumed for the remainder. +  // +  // Some do however (eg Nyhedstjeneste in Denmark) use ISO-8859-1. +  // +  // We attempt some DWIM... +     string charset;    if (!res->charset) {    charset = "macintosh";