preserve the zero-width joiner character
This commit is contained in:
parent
000d33b7a2
commit
e18edd3e24
4 changed files with 49 additions and 13 deletions
|
@ -29,6 +29,8 @@ START_TEST(test_transcript_devanagari_to_czech)
|
||||||
test_transcript("अग्निमीळे", "agnimílé");
|
test_transcript("अग्निमीळे", "agnimílé");
|
||||||
|
|
||||||
test_transcript("तान्यजत्राँ", "tánjadžatrám");
|
test_transcript("तान्यजत्राँ", "tánjadžatrám");
|
||||||
|
|
||||||
|
test_transcript("शृ शृ", "šr šr"); /* the zero width joiner */
|
||||||
}
|
}
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
|
|
|
@ -73,6 +73,13 @@ START_TEST(test_translit_candrabindu)
|
||||||
}
|
}
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
|
START_TEST(test_translit_zero_width_joiner)
|
||||||
|
{
|
||||||
|
test_transliterate_devanagari_to_latin("शृ शृ", "śṛ śṛ");
|
||||||
|
test_transliterate_latin_to_devanagari("śṛ śṛ", "शृ शृ");
|
||||||
|
}
|
||||||
|
END_TEST
|
||||||
|
|
||||||
START_TEST(test_translit_detect_hindi)
|
START_TEST(test_translit_detect_hindi)
|
||||||
{
|
{
|
||||||
char *hindi = NULL;
|
char *hindi = NULL;
|
||||||
|
@ -87,5 +94,6 @@ void register_translit_tests(TCase *test_case)
|
||||||
tcase_add_test(test_case, test_translit_latin_to_devanagari);
|
tcase_add_test(test_case, test_translit_latin_to_devanagari);
|
||||||
tcase_add_test(test_case, test_translit_lla_sylable);
|
tcase_add_test(test_case, test_translit_lla_sylable);
|
||||||
tcase_add_test(test_case, test_translit_candrabindu);
|
tcase_add_test(test_case, test_translit_candrabindu);
|
||||||
|
tcase_add_test(test_case, test_translit_zero_width_joiner);
|
||||||
tcase_add_test(test_case, test_translit_detect_hindi);
|
tcase_add_test(test_case, test_translit_detect_hindi);
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,10 +6,11 @@
|
||||||
#include "transliteration.h"
|
#include "transliteration.h"
|
||||||
#include "utf8.h"
|
#include "utf8.h"
|
||||||
|
|
||||||
#define SCHWA_CHARACTER 'a'
|
#define SCHWA_CHARACTER 'a'
|
||||||
#define VIRAMA 0x094d
|
#define ZERO_WIDTH_JOINER 0x200d
|
||||||
#define NUKTA 0x093c
|
#define VIRAMA 0x094d
|
||||||
#define CHUNKSIZE 1024
|
#define NUKTA 0x093c
|
||||||
|
#define CHUNKSIZE 1024
|
||||||
|
|
||||||
static struct translit_letter table[] = {
|
static struct translit_letter table[] = {
|
||||||
|
|
||||||
|
@ -189,13 +190,16 @@ int transcript_devanagari_to_czech(const char *devanagari, char **ret)
|
||||||
len = utf8_char_length(c);
|
len = utf8_char_length(c);
|
||||||
src += len;
|
src += len;
|
||||||
|
|
||||||
nasal_consonants_filter(latin, &done, prev, c);
|
if (c == ZERO_WIDTH_JOINER)
|
||||||
|
continue;
|
||||||
|
|
||||||
if (c == NUKTA) {
|
if (c == NUKTA) {
|
||||||
*ret = NULL;
|
*ret = NULL;
|
||||||
return EHINDI;
|
return EHINDI;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
nasal_consonants_filter(latin, &done, prev, c);
|
||||||
|
|
||||||
letter = letter_by_code(c);
|
letter = letter_by_code(c);
|
||||||
if (letter) {
|
if (letter) {
|
||||||
switch (letter->type) {
|
switch (letter->type) {
|
||||||
|
@ -221,7 +225,6 @@ int transcript_devanagari_to_czech(const char *devanagari, char **ret)
|
||||||
|
|
||||||
end_of_word_filter(latin, &done, prev, c);
|
end_of_word_filter(latin, &done, prev, c);
|
||||||
|
|
||||||
|
|
||||||
if (c == 0)
|
if (c == 0)
|
||||||
break;
|
break;
|
||||||
prev = c;
|
prev = c;
|
||||||
|
|
|
@ -5,10 +5,11 @@
|
||||||
#include "transliteration.h"
|
#include "transliteration.h"
|
||||||
#include "utf8.h"
|
#include "utf8.h"
|
||||||
|
|
||||||
#define SCHWA_CHARACTER 'a'
|
#define SCHWA_CHARACTER 'a'
|
||||||
#define VIRAMA 0x094d
|
#define ZERO_WIDTH_JOINER 0x200d
|
||||||
#define NUKTA 0x093c
|
#define VIRAMA 0x094d
|
||||||
#define CHUNKSIZE 1024
|
#define NUKTA 0x093c
|
||||||
|
#define CHUNKSIZE 1024
|
||||||
|
|
||||||
static struct translit_letter table[] = {
|
static struct translit_letter table[] = {
|
||||||
|
|
||||||
|
@ -149,7 +150,7 @@ int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
|
||||||
*(latin + done++) = SCHWA_CHARACTER;
|
*(latin + done++) = SCHWA_CHARACTER;
|
||||||
break;
|
break;
|
||||||
case VOWEL_SIGN:
|
case VOWEL_SIGN:
|
||||||
if (done) {
|
if (done && *(latin + done - 1) == 'a') {
|
||||||
/* delete the inherent schwa */
|
/* delete the inherent schwa */
|
||||||
done--;
|
done--;
|
||||||
}
|
}
|
||||||
|
@ -159,6 +160,16 @@ int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
if (c == ZERO_WIDTH_JOINER) {
|
||||||
|
/*
|
||||||
|
* the zero width joiner joins consonants
|
||||||
|
* so the inherent schwa has to be removed.
|
||||||
|
*/
|
||||||
|
if (done && *(latin + done - 1) == 'a') {
|
||||||
|
done--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
utf8_pack_char(latin + done, c);
|
utf8_pack_char(latin + done, c);
|
||||||
done += len;
|
done += len;
|
||||||
}
|
}
|
||||||
|
@ -204,7 +215,7 @@ static struct translit_letter *vowel_sign_by_data(const char *data)
|
||||||
int transliterate_latin_to_devanagari(const char *latin, char **ret)
|
int transliterate_latin_to_devanagari(const char *latin, char **ret)
|
||||||
{
|
{
|
||||||
struct translit_letter *letter, *next;
|
struct translit_letter *letter, *next;
|
||||||
unsigned int alloc = 0, done = 0, len;
|
unsigned int alloc = 0, done = 0, c, len;
|
||||||
const char *src = latin;
|
const char *src = latin;
|
||||||
char *devanagari = NULL;
|
char *devanagari = NULL;
|
||||||
|
|
||||||
|
@ -240,6 +251,14 @@ int transliterate_latin_to_devanagari(const char *latin, char **ret)
|
||||||
done += len;
|
done += len;
|
||||||
src += strlen(letter->data);
|
src += strlen(letter->data);
|
||||||
|
|
||||||
|
/* zero width joiner */
|
||||||
|
c = utf8_unpack_char(src);
|
||||||
|
if (c == ZERO_WIDTH_JOINER) {
|
||||||
|
utf8_pack_char(devanagari + done, ZERO_WIDTH_JOINER);
|
||||||
|
src += 3;
|
||||||
|
done += 3;
|
||||||
|
}
|
||||||
|
|
||||||
if (letter->type == VOWEL || letter->type == CODA)
|
if (letter->type == VOWEL || letter->type == CODA)
|
||||||
continue;
|
continue;
|
||||||
encode_vowel_modifier:
|
encode_vowel_modifier:
|
||||||
|
@ -259,7 +278,11 @@ encode_vowel_modifier:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
devanagari[done++] = *src++;
|
c = utf8_unpack_char(src);
|
||||||
|
len = utf8_char_length(c);
|
||||||
|
utf8_pack_char(devanagari + done, c);
|
||||||
|
done += len;
|
||||||
|
src += len;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue