From e18edd3e249d8cc0e2f113f62ba5b017d590ec17 Mon Sep 17 00:00:00 2001 From: Vlasta Vesely Date: Wed, 21 Apr 2021 21:22:11 +0200 Subject: [PATCH] preserve the zero-width joiner character --- tests/transcript.c | 2 ++ tests/translit.c | 8 ++++++++ transcription.c | 15 +++++++++------ transliteration.c | 37 ++++++++++++++++++++++++++++++------- 4 files changed, 49 insertions(+), 13 deletions(-) diff --git a/tests/transcript.c b/tests/transcript.c index c62370d..93b5b24 100644 --- a/tests/transcript.c +++ b/tests/transcript.c @@ -29,6 +29,8 @@ START_TEST(test_transcript_devanagari_to_czech) test_transcript("अग्निमीळे", "agnimílé"); test_transcript("तान्यजत्राँ", "tánjadžatrám"); + + test_transcript("श‍ृ शृ", "šr šr"); /* the zero width joiner */ } END_TEST diff --git a/tests/translit.c b/tests/translit.c index 35c9720..ba85560 100644 --- a/tests/translit.c +++ b/tests/translit.c @@ -73,6 +73,13 @@ START_TEST(test_translit_candrabindu) } END_TEST +START_TEST(test_translit_zero_width_joiner) +{ + test_transliterate_devanagari_to_latin("श‍ृ शृ", "ś‍ṛ śṛ"); + test_transliterate_latin_to_devanagari("ś‍ṛ śṛ", "श‍ृ शृ"); +} +END_TEST + START_TEST(test_translit_detect_hindi) { char *hindi = NULL; @@ -87,5 +94,6 @@ void register_translit_tests(TCase *test_case) tcase_add_test(test_case, test_translit_latin_to_devanagari); tcase_add_test(test_case, test_translit_lla_sylable); tcase_add_test(test_case, test_translit_candrabindu); + tcase_add_test(test_case, test_translit_zero_width_joiner); tcase_add_test(test_case, test_translit_detect_hindi); } diff --git a/transcription.c b/transcription.c index 104f9a9..68049f2 100644 --- a/transcription.c +++ b/transcription.c @@ -6,10 +6,11 @@ #include "transliteration.h" #include "utf8.h" -#define SCHWA_CHARACTER 'a' -#define VIRAMA 0x094d -#define NUKTA 0x093c -#define CHUNKSIZE 1024 +#define SCHWA_CHARACTER 'a' +#define ZERO_WIDTH_JOINER 0x200d +#define VIRAMA 0x094d +#define NUKTA 0x093c +#define CHUNKSIZE 1024 static struct translit_letter table[] = { @@ -189,13 +190,16 @@ int transcript_devanagari_to_czech(const char *devanagari, char **ret) len = utf8_char_length(c); src += len; - nasal_consonants_filter(latin, &done, prev, c); + if (c == ZERO_WIDTH_JOINER) + continue; if (c == NUKTA) { *ret = NULL; return EHINDI; } + nasal_consonants_filter(latin, &done, prev, c); + letter = letter_by_code(c); if (letter) { switch (letter->type) { @@ -221,7 +225,6 @@ int transcript_devanagari_to_czech(const char *devanagari, char **ret) end_of_word_filter(latin, &done, prev, c); - if (c == 0) break; prev = c; diff --git a/transliteration.c b/transliteration.c index e550d4f..6dfc08d 100644 --- a/transliteration.c +++ b/transliteration.c @@ -5,10 +5,11 @@ #include "transliteration.h" #include "utf8.h" -#define SCHWA_CHARACTER 'a' -#define VIRAMA 0x094d -#define NUKTA 0x093c -#define CHUNKSIZE 1024 +#define SCHWA_CHARACTER 'a' +#define ZERO_WIDTH_JOINER 0x200d +#define VIRAMA 0x094d +#define NUKTA 0x093c +#define CHUNKSIZE 1024 static struct translit_letter table[] = { @@ -149,7 +150,7 @@ int transliterate_devanagari_to_latin(const char *devanagari, char **ret) *(latin + done++) = SCHWA_CHARACTER; break; case VOWEL_SIGN: - if (done) { + if (done && *(latin + done - 1) == 'a') { /* delete the inherent schwa */ done--; } @@ -159,6 +160,16 @@ int transliterate_devanagari_to_latin(const char *devanagari, char **ret) break; } } else { + if (c == ZERO_WIDTH_JOINER) { + /* + * the zero width joiner joins consonants + * so the inherent schwa has to be removed. + */ + if (done && *(latin + done - 1) == 'a') { + done--; + } + } + utf8_pack_char(latin + done, c); done += len; } @@ -204,7 +215,7 @@ static struct translit_letter *vowel_sign_by_data(const char *data) int transliterate_latin_to_devanagari(const char *latin, char **ret) { struct translit_letter *letter, *next; - unsigned int alloc = 0, done = 0, len; + unsigned int alloc = 0, done = 0, c, len; const char *src = latin; char *devanagari = NULL; @@ -240,6 +251,14 @@ int transliterate_latin_to_devanagari(const char *latin, char **ret) done += len; src += strlen(letter->data); + /* zero width joiner */ + c = utf8_unpack_char(src); + if (c == ZERO_WIDTH_JOINER) { + utf8_pack_char(devanagari + done, ZERO_WIDTH_JOINER); + src += 3; + done += 3; + } + if (letter->type == VOWEL || letter->type == CODA) continue; encode_vowel_modifier: @@ -259,7 +278,11 @@ encode_vowel_modifier: } } } else { - devanagari[done++] = *src++; + c = utf8_unpack_char(src); + len = utf8_char_length(c); + utf8_pack_char(devanagari + done, c); + done += len; + src += len; } }