preserve the zero-width joiner character

This commit is contained in:
Vlasta Vesely 2021-04-21 21:22:11 +02:00
parent 000d33b7a2
commit e18edd3e24
No known key found for this signature in database
GPG key ID: EB0E649DC0DFCC22
4 changed files with 49 additions and 13 deletions

View file

@ -29,6 +29,8 @@ START_TEST(test_transcript_devanagari_to_czech)
test_transcript("अग्निमीळे", "agnimílé"); test_transcript("अग्निमीळे", "agnimílé");
test_transcript("तान्यजत्राँ", "tánjadžatrám"); test_transcript("तान्यजत्राँ", "tánjadžatrám");
test_transcript("श‍ृ शृ", "šr šr"); /* the zero width joiner */
} }
END_TEST END_TEST

View file

@ -73,6 +73,13 @@ START_TEST(test_translit_candrabindu)
} }
END_TEST END_TEST
/*
 * Round-trip check for text containing a zero-width joiner (U+200D).
 *
 * Each literal holds two space-separated words: in the first one an
 * invisible U+200D sits between the consonant and the vowel sign, the
 * second one is the same sequence without it. The expected output keeps
 * the joiner at the matching position in both directions.
 *
 * NOTE(review): the two helpers are defined outside this view; presumably
 * they convert the first argument and compare the result with the second
 * -- confirm against their definitions.
 */
START_TEST(test_translit_zero_width_joiner)
{
/* Devanagari -> Latin: the joiner has to survive in the Latin output. */
test_transliterate_devanagari_to_latin("श‍ृ शृ", "ś‍ṛ śṛ");
/* Latin -> Devanagari: the same pair of strings in the opposite direction. */
test_transliterate_latin_to_devanagari("ś‍ṛ śṛ", "श‍ृ शृ");
}
END_TEST
START_TEST(test_translit_detect_hindi) START_TEST(test_translit_detect_hindi)
{ {
char *hindi = NULL; char *hindi = NULL;
@ -87,5 +94,6 @@ void register_translit_tests(TCase *test_case)
tcase_add_test(test_case, test_translit_latin_to_devanagari); tcase_add_test(test_case, test_translit_latin_to_devanagari);
tcase_add_test(test_case, test_translit_lla_sylable); tcase_add_test(test_case, test_translit_lla_sylable);
tcase_add_test(test_case, test_translit_candrabindu); tcase_add_test(test_case, test_translit_candrabindu);
tcase_add_test(test_case, test_translit_zero_width_joiner);
tcase_add_test(test_case, test_translit_detect_hindi); tcase_add_test(test_case, test_translit_detect_hindi);
} }

View file

@ -6,10 +6,11 @@
#include "transliteration.h" #include "transliteration.h"
#include "utf8.h" #include "utf8.h"
#define SCHWA_CHARACTER 'a' #define SCHWA_CHARACTER 'a'
#define VIRAMA 0x094d #define ZERO_WIDTH_JOINER 0x200d
#define NUKTA 0x093c #define VIRAMA 0x094d
#define CHUNKSIZE 1024 #define NUKTA 0x093c
#define CHUNKSIZE 1024
static struct translit_letter table[] = { static struct translit_letter table[] = {
@ -189,13 +190,16 @@ int transcript_devanagari_to_czech(const char *devanagari, char **ret)
len = utf8_char_length(c); len = utf8_char_length(c);
src += len; src += len;
nasal_consonants_filter(latin, &done, prev, c); if (c == ZERO_WIDTH_JOINER)
continue;
if (c == NUKTA) { if (c == NUKTA) {
*ret = NULL; *ret = NULL;
return EHINDI; return EHINDI;
} }
nasal_consonants_filter(latin, &done, prev, c);
letter = letter_by_code(c); letter = letter_by_code(c);
if (letter) { if (letter) {
switch (letter->type) { switch (letter->type) {
@ -221,7 +225,6 @@ int transcript_devanagari_to_czech(const char *devanagari, char **ret)
end_of_word_filter(latin, &done, prev, c); end_of_word_filter(latin, &done, prev, c);
if (c == 0) if (c == 0)
break; break;
prev = c; prev = c;

View file

@ -5,10 +5,11 @@
#include "transliteration.h" #include "transliteration.h"
#include "utf8.h" #include "utf8.h"
#define SCHWA_CHARACTER 'a' #define SCHWA_CHARACTER 'a'
#define VIRAMA 0x094d #define ZERO_WIDTH_JOINER 0x200d
#define NUKTA 0x093c #define VIRAMA 0x094d
#define CHUNKSIZE 1024 #define NUKTA 0x093c
#define CHUNKSIZE 1024
static struct translit_letter table[] = { static struct translit_letter table[] = {
@ -149,7 +150,7 @@ int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
*(latin + done++) = SCHWA_CHARACTER; *(latin + done++) = SCHWA_CHARACTER;
break; break;
case VOWEL_SIGN: case VOWEL_SIGN:
if (done) { if (done && *(latin + done - 1) == 'a') {
/* delete the inherent schwa */ /* delete the inherent schwa */
done--; done--;
} }
@ -159,6 +160,16 @@ int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
break; break;
} }
} else { } else {
if (c == ZERO_WIDTH_JOINER) {
/*
* the zero width joiner joins consonants
* so the inherent schwa has to be removed.
*/
if (done && *(latin + done - 1) == 'a') {
done--;
}
}
utf8_pack_char(latin + done, c); utf8_pack_char(latin + done, c);
done += len; done += len;
} }
@ -204,7 +215,7 @@ static struct translit_letter *vowel_sign_by_data(const char *data)
int transliterate_latin_to_devanagari(const char *latin, char **ret) int transliterate_latin_to_devanagari(const char *latin, char **ret)
{ {
struct translit_letter *letter, *next; struct translit_letter *letter, *next;
unsigned int alloc = 0, done = 0, len; unsigned int alloc = 0, done = 0, c, len;
const char *src = latin; const char *src = latin;
char *devanagari = NULL; char *devanagari = NULL;
@ -240,6 +251,14 @@ int transliterate_latin_to_devanagari(const char *latin, char **ret)
done += len; done += len;
src += strlen(letter->data); src += strlen(letter->data);
/* zero width joiner */
c = utf8_unpack_char(src);
if (c == ZERO_WIDTH_JOINER) {
utf8_pack_char(devanagari + done, ZERO_WIDTH_JOINER);
src += 3;
done += 3;
}
if (letter->type == VOWEL || letter->type == CODA) if (letter->type == VOWEL || letter->type == CODA)
continue; continue;
encode_vowel_modifier: encode_vowel_modifier:
@ -259,7 +278,11 @@ encode_vowel_modifier:
} }
} }
} else { } else {
devanagari[done++] = *src++; c = utf8_unpack_char(src);
len = utf8_char_length(c);
utf8_pack_char(devanagari + done, c);
done += len;
src += len;
} }
} }