Difference between revisions of "Adding new dictionaries to aspell"

(How to add a new dictionary to aspell)
(How to add a new dictionary to aspell)
Line 61: Line 61:
  
 
* To add a set of custom words see this [http://aspell.sourceforge.net/man-html/Creating-an-Individual-Word-List.html How-To]
 
* To add a set of custom words see this [http://aspell.sourceforge.net/man-html/Creating-an-Individual-Word-List.html How-To]
 +
 +
 +
=== Adding Latin-2 dictionary support ===
 +
Many aspell dictionaries do not use UTF-8 encoding, while UTF-8 is the encoding of choice for the Zimbra suite.
 +
This means that text should be converted from UTF-8 to ISO-8859-x and spell checked, and then the misspelled and suggested words should be
 +
translated back to UTF-8 for browser display. The following patch shows (Slovenian) modifications to Zimbra 4.5.9 spelling processor:
 +
 +
--- /opt/zimbra/httpd/htdocs/aspell.php.orig  2007-11-01 14:44:25.000000000 +0100
 +
+++ /opt/zimbra/httpd/htdocs/aspell.php        2007-11-01 15:50:04.000000000 +0100
 +
@@ -18,7 +18,7 @@
 +
 
 +
  $filename = "";
 +
  $text = "";
 +
-$locale = "en_EN";
 +
+$locale = "sl_SI";
 +
 
 +
  if (isset($_FILES["text"])) {
 +
      $text = file_get_contents($_FILES["text"]);
 +
@@ -33,12 +33,17 @@
 +
  if ($text != NULL) {
 +
      setlocale(LC_ALL, $locale);
 +
 
 +
+    // Convert all text into 8-bit dictionary locale
 +
+    $text=iconv("UTF-8", "iso-8859-2", $text);
 +
+
 +
      // Get rid of double-dashes, since we ignore dashes
 +
      // when splitting words
 +
      $text = preg_replace('/--+/', ' ', $text);
 +
 
 +
      // Split on anything that's not a word character, quote or dash
 +
-    $words = preg_split('/[^\w\'-]+/', $text);
 +
+    // $words = preg_split('/[^\w\'-]+/', $text);
 +
+    // Do not split visible characters in the range of 0xA0-0xFF
 +
+      $words = preg_split('/[^\w\'\xa0-\xff-]+/', $text);
 +
 
 +
      // Load dictionary
 +
      $dictionary = pspell_new($locale);
 +
@@ -79,14 +84,15 @@
 +
          } else {
 +
              $checked_words[$word] = 1;
 +
          }
 +
-
 +
          // Check spelling
 +
          if (!pspell_check($dictionary, $word)) {
 +
              $suggestions = implode(",", pspell_suggest($dictionary, $word));
 +
-            $suggestions = utf8_encode($suggestions);
 +
              $misspelled .= "$word:$suggestions\n";
 +
          }
 +
      }
 +
+  // Convert to dictionary locale
 +
+  $suggestions=iconv("iso-8859-2","UTF-8",$suggestions);
 +
+  $misspelled = iconv("iso-8859-2","UTF-8",$misspelled);
 +
 
 +
      $response = new ServerResponse();
 +
      $response->addParameter("misspelled", $misspelled);

Revision as of 13:51, 1 November 2007

How to add a new dictionary to aspell

  • Download the dictionary from gnu.org
  • Extract it from the tar file in a work directory
  • Set Zimbra's aspell in the path with:
PATH=/opt/zimbra/aspell-0.60.3/bin:$PATH
  • Configure and install the dictionary according to the README

./configure
make
make install

  • Edit the file /opt/zimbra/httpd/htdocs/aspell.php to reference the new dictionary. For example to add the french dictionary:

$dictionary = pspell_new("en_EN");
TO
$dictionary = pspell_new("fr_FR");


NOTE: I think it is better to change the value of the variable $locale at the start of the file. NOTE: If you are using a non-English language with special characters such as accents or tildes (Spanish, for example), you have to modify aspell.php

This file is located at /opt/zimbra/httpd/htdocs/aspell.php. Replace this block (line 82 or so)

 $suggestions = implode(",", pspell_suggest($dictionary, $word));
 $misspelled .= "$word:$suggestions\n";

with this one:

 $suggestions = implode(",", pspell_suggest($dictionary, $word));
 $suggestions=iconv("iso-8859-1","UTF-8",$suggestions);
 $misspelled .= "$word:$suggestions\n";

NOTE: After changing the aspell language restart the spellchecker as the user zimbra with the following command:

 zmspellctl stop; zmspellctl start

There is also a problem when splitting words. Replace (line 48 or so)

$words = preg_split('/[^\w\'-]+/', $text);

with this one:

$words = preg_split('/[^\w\'\xc0-\xfd-]+/', $text);

This regexp should be enough for most Western European languages (Spanish, French, German, Portuguese and Italian). It includes all ISO-8859-1 (Latin-1) European letters in the range 192-253 (0xC0-0xFD) of the table below.

latin1.gif


  • To add a set of custom words see this How-To


Adding Latin-2 dictionary support

Many aspell dictionaries do not use UTF-8 encoding, while UTF-8 is the encoding of choice for the Zimbra suite. This means that text should be converted from UTF-8 to ISO-8859-x and spell checked, and then the misspelled and suggested words should be translated back to UTF-8 for browser display. The following patch shows (Slovenian) modifications to the Zimbra 4.5.9 spelling processor:

--- /opt/zimbra/httpd/htdocs/aspell.php.orig   2007-11-01 14:44:25.000000000 +0100
+++ /opt/zimbra/httpd/htdocs/aspell.php        2007-11-01 15:50:04.000000000 +0100
@@ -18,7 +18,7 @@
 
 $filename = "";
 $text = "";
-$locale = "en_EN";
+$locale = "sl_SI";
 
 if (isset($_FILES["text"])) {
     $text = file_get_contents($_FILES["text"]);
@@ -33,12 +33,17 @@
 if ($text != NULL) {
     setlocale(LC_ALL, $locale);
 
+    // Convert all text into 8-bit dictionary locale
+    $text=iconv("UTF-8", "iso-8859-2", $text);
+
     // Get rid of double-dashes, since we ignore dashes
     // when splitting words
     $text = preg_replace('/--+/', ' ', $text);
 
     // Split on anything that's not a word character, quote or dash
-    $words = preg_split('/[^\w\'-]+/', $text);
+    // $words = preg_split('/[^\w\'-]+/', $text);
+    // Do not split visible characters in the range of 0xA0-0xFF
+      $words = preg_split('/[^\w\'\xa0-\xff-]+/', $text);
 
     // Load dictionary
     $dictionary = pspell_new($locale);
@@ -79,14 +84,15 @@
         } else {
             $checked_words[$word] = 1;
         }
-
         // Check spelling
         if (!pspell_check($dictionary, $word)) {
             $suggestions = implode(",", pspell_suggest($dictionary, $word));
-            $suggestions = utf8_encode($suggestions);
             $misspelled .= "$word:$suggestions\n";
         }
     }
+   // Convert to dictionary locale
+   $suggestions=iconv("iso-8859-2","UTF-8",$suggestions);
+   $misspelled = iconv("iso-8859-2","UTF-8",$misspelled); 
 
     $response = new ServerResponse();
     $response->addParameter("misspelled", $misspelled);
Jump to: navigation, search