Adding new dictionaries to aspell: Difference between revisions
Line 61: | Line 61: | ||
* To add a set of custom words see this [http://aspell.sourceforge.net/man-html/Creating-an-Individual-Word-List.html How-To] | * To add a set of custom words see this [http://aspell.sourceforge.net/man-html/Creating-an-Individual-Word-List.html How-To] | ||
=== Adding Latin-2 dictionary support === | |||
Many aspell dictionaries do not use UTF-8 encoding, while UTF-8 is the encoding of choice for the Zimbra suite. | |||
This means that text should be converted from UTF-8 to ISO-8859-x, spell checked, and then the misspelled and suggested words should be | |||
translated back to UTF-8 for browser display. The following patch shows the (Slovenian) modifications to the Zimbra 4.5.9 spelling processor: | |||
--- /opt/zimbra/httpd/htdocs/aspell.php.orig 2007-11-01 14:44:25.000000000 +0100 | |||
+++ /opt/zimbra/httpd/htdocs/aspell.php 2007-11-01 15:50:04.000000000 +0100 | |||
@@ -18,7 +18,7 @@ | |||
$filename = ""; | |||
$text = ""; | |||
-$locale = "en_EN"; | |||
+$locale = "sl_SI"; | |||
if (isset($_FILES["text"])) { | |||
$text = file_get_contents($_FILES["text"]); | |||
@@ -33,12 +33,17 @@ | |||
if ($text != NULL) { | |||
setlocale(LC_ALL, $locale); | |||
+ // Convert all text into 8-bit dictionary locale | |||
+ $text=iconv("UTF-8", "iso-8859-2", $text); | |||
+ | |||
// Get rid of double-dashes, since we ignore dashes | |||
// when splitting words | |||
$text = preg_replace('/--+/', ' ', $text); | |||
// Split on anything that's not a word character, quote or dash | |||
- $words = preg_split('/[^\w\'-]+/', $text); | |||
+ // $words = preg_split('/[^\w\'-]+/', $text); | |||
+ // Do not split visible characters in the range of 0xA0-0xFF | |||
+ $words = preg_split('/[^\w\'\xa0-\xff-]+/', $text); | |||
// Load dictionary | |||
$dictionary = pspell_new($locale); | |||
@@ -79,14 +84,15 @@ | |||
} else { | |||
$checked_words[$word] = 1; | |||
} | |||
- | |||
// Check spelling | |||
if (!pspell_check($dictionary, $word)) { | |||
$suggestions = implode(",", pspell_suggest($dictionary, $word)); | |||
- $suggestions = utf8_encode($suggestions); | |||
$misspelled .= "$word:$suggestions\n"; | |||
} | |||
} | |||
+ // Convert to dictionary locale | |||
+ $suggestions=iconv("iso-8859-2","UTF-8",$suggestions); | |||
+ $misspelled = iconv("iso-8859-2","UTF-8",$misspelled); | |||
$response = new ServerResponse(); | |||
$response->addParameter("misspelled", $misspelled); |
Revision as of 13:51, 1 November 2007
How to add a new dictionary to aspell
- Download the dictionary from gnu.org
- Extract it from the tar file in a work directory
- Set Zimbra's aspell in the path with:
- PATH=/opt/zimbra/aspell-0.60.3/bin:$PATH
- Configure and install the dictionary according to the README
- ./configure
- make
- make install
- Edit the file /opt/zimbra/httpd/htdocs/aspell.php to reference the new dictionary. For example to add the french dictionary:
- $dictionary = pspell_new("en_EN");
- TO
- $dictionary = pspell_new("fr_FR");
NOTE: I think it is better to change the value of the variable $locale at the start of the file.
NOTE: If you are using a non-English language with special characters such as tildes (Spanish, for example), you have to modify aspell.php.
This file is located at /opt/zimbra/httpd/htdocs/aspell.php. Replace this block (line 82 or so)
$suggestions = implode(",", pspell_suggest($dictionary, $word)); $misspelled .= "$word:$suggestions\n";
with this one:
$suggestions = implode(",", pspell_suggest($dictionary, $word)); $suggestions=iconv("iso-8859-1","UTF-8",$suggestions); $misspelled .= "$word:$suggestions\n";
NOTE: After changing the aspell language restart the spellchecker as the user zimbra with the following command:
zmspellctl stop; zmspellctl start
There is also a problem when splitting words. Replace (line 48 or so)
$words = preg_split('/[^\w\'-]+/', $text);
with this one:
$words = preg_split('/[^\w\'\xc0-\xfd-]+/', $text);
This regexp line should be enough for most Western European languages (Spanish, French, German, Portuguese and Italian). It includes all ISO-8859 European letters in the range 192-253 of the table below.
- To add a set of custom words see this How-To
Adding Latin-2 dictionary support
Many aspell dictionaries do not use UTF-8 encoding, while UTF-8 is the encoding of choice for the Zimbra suite. This means that text should be converted from UTF-8 to ISO-8859-x, spell checked, and then the misspelled and suggested words should be translated back to UTF-8 for browser display. The following patch shows the (Slovenian) modifications to the Zimbra 4.5.9 spelling processor:
--- /opt/zimbra/httpd/htdocs/aspell.php.orig 2007-11-01 14:44:25.000000000 +0100 +++ /opt/zimbra/httpd/htdocs/aspell.php 2007-11-01 15:50:04.000000000 +0100 @@ -18,7 +18,7 @@ $filename = ""; $text = ""; -$locale = "en_EN"; +$locale = "sl_SI"; if (isset($_FILES["text"])) { $text = file_get_contents($_FILES["text"]); @@ -33,12 +33,17 @@ if ($text != NULL) { setlocale(LC_ALL, $locale); + // Convert all text into 8-bit dictionary locale + $text=iconv("UTF-8", "iso-8859-2", $text); + // Get rid of double-dashes, since we ignore dashes // when splitting words $text = preg_replace('/--+/', ' ', $text); // Split on anything that's not a word character, quote or dash - $words = preg_split('/[^\w\'-]+/', $text); + // $words = preg_split('/[^\w\'-]+/', $text); + // Do not split visible characters in the range of 0xA0-0xFF + $words = preg_split('/[^\w\'\xa0-\xff-]+/', $text); // Load dictionary $dictionary = pspell_new($locale); @@ -79,14 +84,15 @@ } else { $checked_words[$word] = 1; } - // Check spelling if (!pspell_check($dictionary, $word)) { $suggestions = implode(",", pspell_suggest($dictionary, $word)); - $suggestions = utf8_encode($suggestions); $misspelled .= "$word:$suggestions\n"; } } + // Convert to dictionary locale + $suggestions=iconv("iso-8859-2","UTF-8",$suggestions); + $misspelled = iconv("iso-8859-2","UTF-8",$misspelled); $response = new ServerResponse(); $response->addParameter("misspelled", $misspelled);