Roxen.git / server / modules / tags / check_spelling.pike

version» Context lines:

Roxen.git/server/modules/tags/check_spelling.pike:1:   // This is a roxen module. Copyright © 2000 - 2009, Roxen IS.   //      #include <module.h>   inherit "module";      constant thread_safe=1;    - constant cvs_version = "$Id: check_spelling.pike,v 1.39 2012/04/13 17:08:30 jonasw Exp $"; + constant cvs_version = "$Id$";      constant module_type = MODULE_TAG|MODULE_PROVIDER;   constant module_name = "Tags: Spell checker";   constant module_doc =   #"Checks for misspelled words using the <tt>&lt;emit#spellcheck&gt;</tt> or   <tt>&lt;spell&gt;</tt> tags.";      array(string) query_provides()   {    return ({ "spellchecker" });
Roxen.git/server/modules/tags/check_spelling.pike:64:       defvar("extra_dicts", ({ }), "Custom dictionaries",    TYPE_FILE_LIST,    "Paths to custom dictionary files. These should be plain-text "    "files with one word on each line. NOTE: Filenames must include "    "a valid language code before the file suffix, e.g. "    "<tt>mywords.en.txt</tt> or <tt>mywords.en_US.txt</tt>. "    "The plain-text files must also use UTF-8 encoding if you enable "    "the UTF-8 support in the setting below.");    +  defvar("run_together_langs", "sv", "Languages with run-together words", +  TYPE_STRING, +  "A comma-separated list of language codes where run-together words " +  "are considered valid. This behavior is useful in languages such as " +  "Swedish but not appropriate for English."); +     defvar("report", "popup", "Default report type", TYPE_STRING_LIST,    "The default report type used, when not specified in the "    "&lt;spell&gt; tag.",    ({ "popup","table" }) );       defvar("prestate", "", "Prestate",TYPE_STRING,    "If specified, only check spelling in the &lt;spell&gt; tag "    "when this prestate is present.");       defvar("use_utf8", 1, "Enable UTF-8 support",
// Diff hunk (context starting at line 114 of check_spelling.pike); the
// extraction collapsed it onto one line, "-"/"+" mark removed/added text.
//
// start(when, conf): calls sync_extra_dicts() with no arguments. Neither
// parameter is used in the visible body.
//
// get_processed_dict_path(extra_dict): hashes the given path with MD5,
// hex-encodes and lower-cases the digest, and returns
// combine_path(getcwd(), roxen_path("$VARDIR/check_spelling/" + hash + ".dict")).
// The "-"/"+" pair replaces Crypto.MD5()->hash(...) with Crypto.MD5.hash(...).
//
// get_extra_dict_language(ed_path) begins at the end of this hunk but is cut
// off after its first statement, so its behaviour is not documented here.
Roxen.git/server/modules/tags/check_spelling.pike:114:   void start(int when, Configuration conf)   {    sync_extra_dicts();   }         string get_processed_dict_path(string extra_dict)   {    // Hash the external path and return a corresponding item in $VARDIR    string ed_hash = -  lower_case(String.string2hex(Crypto.MD5()->hash(extra_dict))); +  lower_case(String.string2hex(Crypto.MD5.hash(extra_dict)));    return    combine_path(getcwd(),    roxen_path("$VARDIR/check_spelling/" + ed_hash + ".dict"));   }      string|void get_extra_dict_language(string ed_path)   {    // Only accept filenames structured as mywords.en.txt. We require    // at least two "." and a non-empty language code.    string ed_name = basename(ed_path);
// Diff hunk (context starting at line 159 of check_spelling.pike); the
// extraction collapsed it onto two lines, "-"/"+" mark removed/added text.
// The hunk opens with the tail of a function whose start is not visible and
// closes with the first lines of sync_extra_dicts(), which is cut off.
//
// get_encoding_from_bom(data) [added by this diff]:
//   Looks only at the leading bytes of `data`.
//   Returns ({ "utf-8", 3 })    for EF BB BF (requires at least 3 bytes),
//           ({ "utf-16", 2 })   for FE FF    (requires at least 2 bytes),
//           ({ "utf-16le", 2 }) for FF FE    (requires at least 2 bytes),
//   and 0 when none of these match. The second array element is the number
//   of BOM bytes the caller should skip.
//
// process_extra_dict(ed_path, pd_path):
//   Converts the word list at ed_path into a dictionary file at pd_path by
//   running the binary named by query("spellchecker").
//   1. mkdirhier() on the directory of pd_path.
//   2. Returns -1 if the configured binary does not exist.
//   3. Builds the argument list: binary, "--lang" plus the result of
//      get_extra_dict_language(ed_path), "--encoding utf-8" only when
//      query("use_utf8") is true, then "create", "master", pd_path.
//   4. [added] Reads ed_path in full; returns -1 if the read yields nothing.
//   5. [added] If a BOM is found, drops it, and when the BOM is not UTF-8
//      decodes with Charset.decoder() and re-encodes with string_to_utf8().
//   6. [added] Rewrites "\r\n" and "\r" as "\n" and writes the result to
//      ed_path + ".tmp"; returns -1 if the write throws.
//   7. Runs the process with that temp file (previously ed_path itself) as
//      stdin, waits for it, removes the temp file, logs "Error" or "OK",
//      and returns the value from p->wait().
//
//   NOTE(review): rm(ed_cleaned_path) is only reached on the normal path; if
//   opening the temp file or creating the process throws, the ".tmp" file
//   appears to be left behind -- confirm and consider cleanup on failure.
//   NOTE(review): BOM-triggered recoding to UTF-8 happens regardless of the
//   use_utf8 setting, while "--encoding utf-8" is passed only when use_utf8
//   is on -- presumably acceptable, but verify against the aspell invocation.
Roxen.git/server/modules/tags/check_spelling.pike:159:    } else if (include_empty) {    // Not yet processed but a valid candidate    res[ed_path] = pd_path;    }    }    }    return res;   }       + // Returns tuple < encoding, chars to skip > if the given data string + // starts with a BOM, and zero otherwise. + array(string|int) get_encoding_from_bom(string data) + { +  // We only care about UTF-8 and UTF-16 BE/LE: +  // +  // EF BB BF - UTF-8 +  // FE FF - UTF-16 big-endian +  // FF FE - UTF-16 little-endian +  if (sizeof(data) >= 3) { +  if (data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) +  return ({ "utf-8", 3 }); +  } +  if (sizeof(data) >= 2) { +  if (data[0] == 0xFE && data[1] == 0xFF) +  return ({ "utf-16", 2 }); +  if (data[0] == 0xFF && data[1] == 0xFE) +  return ({ "utf-16le", 2 }); +  } +  return 0; + } +  +    int process_extra_dict(string ed_path, string pd_path)   {    // Make sure destination directory exists    mkdirhier(dirname(pd_path) + "/");       // Convert the extra_dict source file (plain-text file) in ed_path    // and write it to the processed dictionary at pd_path.    string aspell_binary = query("spellchecker");    if (!Stdio.exist(aspell_binary))    return -1;       int use_utf8 = query("use_utf8");    array(string) args =    ({ aspell_binary, "--lang", get_extra_dict_language(ed_path) }) +    (use_utf8 ? ({ "--encoding", "utf-8" }) : ({ }) ) +    ({ "create", "master", pd_path });    report_notice("Spell Checker: Converting dictionary %s... ", ed_path);    -  Stdio.File in_file = Stdio.File(ed_path); +  // Aspell doesn't like MS-DOS line endings so write a clean temp file. +  // We also heed any BOM that we find. 
+  string in_data = Stdio.read_bytes(ed_path); +  if (!in_data) { +  report_notice("Error reading dictionary: %s\n", ed_path); +  return -1; +  } +  if (array bom_data = get_encoding_from_bom(in_data)) { +  // Skip BOM bytes and recode to UTF-8 if currently in a different format +  in_data = in_data[bom_data[1]..]; +  if (bom_data[0] != "utf-8") { +  if (Charset.Decoder dec = Charset.decoder(bom_data[0])) +  in_data = string_to_utf8(dec->feed(in_data)->drain()); +  } +  } +  in_data = replace(in_data, ({ "\r\n", "\r" }), ({ "\n", "\n" }) ); +  string ed_cleaned_path = ed_path + ".tmp"; +  if (mixed err = catch { +  Stdio.write_file(ed_cleaned_path, in_data); +  }) { +  report_notice("Error writing temp file: %s\n", ed_cleaned_path); +  return -1; +  } +  +  Stdio.File in_file = Stdio.File(ed_cleaned_path);    Process.Process p = Process.Process(args, ([ "stdin": in_file ]) );    in_file->close();    int err = p->wait(); -  +  rm(ed_cleaned_path);    report_notice((err ? "Error" : "OK") + "\n");    return err;   }         void sync_extra_dicts(void|int force_rebuild)   {    // Stat each of the configured extra dictionaries and check whether our    // compressed versions are out-of-date.    foreach (get_extra_dicts(1); string ed_path; string pd_path) {
Roxen.git/server/modules/tags/check_spelling.pike:376:   string run_spellcheck(string|array(string) words, void|string dict)   // Returns 0 on failure.   {    // Sync any custom dictionaries in case they have been edited, and    // fetch a list of valid ones to add in this run.    sync_extra_dicts();    mapping(string:string) extra_dicts = get_extra_dicts();    array(string) ed_args = ({ });    foreach (extra_dicts; string ed_path; string pd_path) {    if (pd_path) -  ed_args += ({ "--extra-dicts", pd_path }); +  ed_args += ({ "--add-extra-dicts", pd_path });    }    -  +  // Should run-together words be considered? +  array(string) run_together_langs = +  map(query("run_together_langs") / ",", String.trim_all_whites); +  int use_run_together = dict && has_value(run_together_langs, dict); +     object file1=Stdio.File();    object file2=file1->pipe();    object file3=Stdio.File();    object file4=file3->pipe();    string spell_res;    int use_utf8 = query("use_utf8");       if(stringp(words))    words = replace(words, "\n", " ");    if(!Stdio.exist(query("spellchecker")))    {    werror("check_spelling: Missing binary in %s\n", query("spellchecker"));    return 0;    }    Process.Process p = -  Process.Process(({ query("spellchecker"), "-a", "-C" }) + +  Process.Process(({ query("spellchecker"), "-a" }) + +  (use_run_together ? ({ "-C" }) : ({ }) ) +    (use_utf8 ? ({ "--encoding=utf-8" }) : ({ }) ) +    (stringp(words) ? ({ "-H" }) : ({ }) ) +    (dict ? ({ "-d", dict }) : ({ }) ) +    ed_args,    ([ "stdin":file2,"stdout":file4 ]));       string text = stringp(words) ?    " "+words /* Extra space to ignore aspell commands    (potential security problem), compensated    below. */ :    " "+words*"\n "+"\n" /* Compatibility mode. 
*/;       // Aspell 0.60 or later understands UTF-8 encoding natively    if (use_utf8)    text = string_to_utf8(text);    else -  text = Locale.Charset.encoder("iso-8859-1", "\xa0")->feed(text)->drain(); +  text = Charset.encoder("iso-8859-1", "\xa0")->feed(text)->drain();       Stdio.sendfile(({ text }), 0, 0, -1, 0, file1,    lambda(int bytes) { file1->close(); });       file2->close();    file4->close();    spell_res=file3->read();    file3->close();       if (use_utf8 && spell_res)