diff --git a/tests/translit.c b/tests/translit.c index ed6aa6b..41403ae 100644 --- a/tests/translit.c +++ b/tests/translit.c @@ -6,105 +6,81 @@ #define ZWJ "\u200d" #define ZWNJ "\u200c" -static void test_transliterate_devanagari_to_latin(const char *devanagari, - const char *latin) +static void test_translit(const char *devanagari, const char *latin) { - char *str; + char *a, *b; int ret; - ret = transliterate_devanagari_to_latin(devanagari, &str); + ret = transliterate_devanagari_to_latin(devanagari, &a); ck_assert_int_eq(0, ret); - ck_assert_str_eq(latin, str); - free(str); + ck_assert_str_eq(latin, a); + + ret = transliterate_latin_to_devanagari(a, &b); + ck_assert_int_eq(0, ret); + ck_assert_str_eq(devanagari, b); + + free(a); + free(b); } -static void test_transliterate_latin_to_devanagari(const char *latin, - const char *devanagari) -{ - char *str; - int ret; - - ret = transliterate_latin_to_devanagari(latin, &str); - ck_assert_str_eq(devanagari, str); - free(str); -} - -START_TEST(test_translit_devanagari_to_latin) +START_TEST(test_translit_words) { /* https://en.wikipedia.org/wiki/Sanskrit */ - test_transliterate_devanagari_to_latin("संस्कृतम्", "saṃskṛtam"); + test_translit("संस्कृतम्", "saṃskṛtam"); /* https://en.wikipedia.org/wiki/Bhagavad_Gita */ - test_transliterate_devanagari_to_latin("भगवद्गीता", "bhagavadgītā"); + test_translit("भगवद्गीता", "bhagavadgītā"); /* https://en.wikipedia.org/wiki/%C4%80ry%C4%81varta */ - test_transliterate_devanagari_to_latin("आर्यावर्त", "āryāvarta"); + test_translit("आर्यावर्त", "āryāvarta"); /* https://en.wikipedia.org/wiki/Mahabharata */ - test_transliterate_devanagari_to_latin("महाभारतम्", "mahābhāratam"); + test_translit("महाभारतम्", "mahābhāratam"); /* https://en.wikipedia.org/wiki/Devanagari */ - test_transliterate_devanagari_to_latin("देवनागरी", "devanāgarī"); - -} -END_TEST - -START_TEST(test_translit_latin_to_devanagari) -{ - test_transliterate_latin_to_devanagari("saṃskṛtam", "संस्कृतम्"); - test_transliterate_latin_to_devanagari("bhagavadgītā", "भगवद्गीता"); - test_transliterate_latin_to_devanagari("āryāvarta", "आर्यावर्त"); - test_transliterate_latin_to_devanagari("mahābhāratam", "महाभारतम्"); - test_transliterate_latin_to_devanagari("devanāgarī", "देवनागरी"); -} -END_TEST - -START_TEST(test_translit_lla_sylable) -{ - test_transliterate_devanagari_to_latin("अग्निमीळे", "agnimīḷe"); - test_transliterate_latin_to_devanagari("agnimīḷe", "अग्निमीळे"); - - /* rigveda 1.22.17 */ - test_transliterate_devanagari_to_latin("समूळ्हमस्य", "samūḷhamasya"); - test_transliterate_latin_to_devanagari("samūḷhamasya", "समूळ्हमस्य"); - - test_transliterate_devanagari_to_latin("चाकॢप्र", "cākḷpra"); - test_transliterate_latin_to_devanagari("cākḷpra", "चाकॢप्र"); -} -END_TEST - -START_TEST(test_translit_candrabindu) -{ - test_transliterate_devanagari_to_latin("तान्यजत्राँ", "tānyajatrām̐"); - test_transliterate_latin_to_devanagari("tānyajatrām̐", "तान्यजत्राँ"); -} -END_TEST - -START_TEST(test_translit_zero_width_joiner) -{ - test_transliterate_devanagari_to_latin("श‍ृ शृ", "ś"ZWJ"ṛ śṛ"); - test_transliterate_latin_to_devanagari("ś"ZWJ"ṛ śṛ", "श‍ृ शृ"); + test_translit("देवनागरी", "devanāgarī"); } END_TEST START_TEST(test_translit_vedic) { /* rigveda 1.25.4 */ - test_transliterate_devanagari_to_latin("वस्यइष्टये", "vasya"ZWNJ"iṣṭaye"); - test_transliterate_latin_to_devanagari("vasya"ZWNJ"iṣṭaye", "वस्यइष्टये"); + test_translit("वस्यइष्टये", "vasya"ZWNJ"iṣṭaye"); /* rigveda 3.5.2 */ - test_transliterate_devanagari_to_latin("पूर्वीर्ऋतस्य", "pūrvīr"ZWNJ"ṛtasya"); - test_transliterate_latin_to_devanagari("pūrvīr"ZWNJ"ṛtasya", "पूर्वीर्ऋतस्य"); + test_translit("पूर्वीर्ऋतस्य", "pūrvīr"ZWNJ"ṛtasya"); +} +END_TEST + +START_TEST(test_translit_lla_sylable) +{ + test_translit("अग्निमीळे", "agnimīḷe"); /* rigveda 1.1.1 */ + test_translit("चाकॢप्र", "cākḷpra"); /* rigveda 10.130.5 */ + test_translit("समूळ्हमस्य", "samūḷhamasya"); /* rigveda 1.22.17 */ + + test_translit("ऌ ऌप ऌळ", "ḷ ḷpa ḷḷa"); /* at the beginning = vowel */ + test_translit("ळ ळे ळृ", "ḷa ḷe ḷṛ"); /* followed by a vowel = consonant */ + test_translit("प्ळ पळ ओप्ळ", "pḷa paḷa opḷa"); +} +END_TEST + +START_TEST(test_translit_candrabindu) +{ + test_translit("तान्यजत्राँ", "tānyajatrām̐"); +} +END_TEST + +START_TEST(test_translit_zero_width_joiner) +{ + test_translit("श‍ृ शृ", "ś"ZWJ"ṛ śṛ"); } END_TEST void register_translit_tests(TCase *test_case) { - tcase_add_test(test_case, test_translit_devanagari_to_latin); - tcase_add_test(test_case, test_translit_latin_to_devanagari); + tcase_add_test(test_case, test_translit_words); + tcase_add_test(test_case, test_translit_vedic); tcase_add_test(test_case, test_translit_lla_sylable); tcase_add_test(test_case, test_translit_candrabindu); tcase_add_test(test_case, test_translit_zero_width_joiner); - tcase_add_test(test_case, test_translit_vedic); } diff --git a/transliteration.c b/transliteration.c index 2f8b1bc..a21a7be 100644 --- a/transliteration.c +++ b/transliteration.c @@ -31,6 +31,12 @@ static struct translit_letter table[] = { {0x090f, VOWEL, "e"}, /* 13 */ {0x0913, VOWEL, "o"}, /* 14 */ + /* Codas */ + {0x0902, CODA, "\u1e43"}, /* anusvara (.m) */ + {0x0903, CODA, "\u1e25"}, /* visarga (.h) */ + {0x093d, CODA, "'"}, /* avagrada (') */ + {0x0901, CODA, "m\u0310"}, /* candrabindu */ + /* Consonants */ {0x0916, CONSONANT, "kh"}, /* 01 */ {0x0918, CONSONANT, "gh"}, /* 02 */ @@ -67,12 +73,6 @@ static struct translit_letter table[] = { {0x0935, CONSONANT, "v"}, /* 33 */ {0x0933, CONSONANT, "\u1e37"}, /* (.l) */ - /* Codas */ - {0x0902, CODA, "\u1e43"}, /* anusvara (.m) */ - {0x0903, CODA, "\u1e25"}, /* visarga (.h) */ - {0x093d, CODA, "'"}, /* avagrada (') */ - {0x0901, CODA, "m\u0310"}, /* candrabindu */ - /* Numbers */ {0x0966, NUMBER, "0"}, {0x0967, NUMBER, "1"}, @@ -118,6 +118,12 @@ static struct translit_letter *letter_by_code(unsigned int c) return NULL; } +static inline int is_vowel_sign(unsigned int c) +{ + struct translit_letter *letter = letter_by_code(c); + return letter && letter->type == VOWEL_SIGN; +} + int transliterate_devanagari_to_latin(const char *devanagari, char **ret) { struct translit_letter *letter, *prev = NULL; @@ -159,12 +165,10 @@ int transliterate_devanagari_to_latin(const char *devanagari, char **ret) break; } } else { - if (c == ZERO_WIDTH_JOINER) { - /* - * the zero width joiner joins consonants - * so the inherent schwa has to be removed. - */ - if (done && *(latin + done - 1) == 'a') { + if (c == ZERO_WIDTH_JOINER && done) { + /* The ZWJ can substitute a virama so we need + * to remove the inherent schwa. */ + if (is_vowel_sign(utf8_unpack_char(src))) { done--; } } @@ -213,10 +217,15 @@ static struct translit_letter *vowel_sign_by_data(const char *data) return NULL; } +#define PACK_LETTER(d, c) { \ + utf8_pack_char(d, c); \ + done += utf8_char_length(c); \ +} + int transliterate_latin_to_devanagari(const char *latin, char **ret) { - struct translit_letter *letter = NULL, *next; - unsigned int alloc = 0, done = 0, c, len; + struct translit_letter *letter = NULL, *next, *last = NULL; + unsigned int alloc = 0, done = 0, c = 0; const char *src = latin; char *devanagari = NULL; @@ -226,38 +235,13 @@ int transliterate_latin_to_devanagari(const char *latin, char **ret) alloc += CHUNKSIZE; } - /* consonant (.l) */ - if (strncmp(src, "\u1e37", 3) == 0) { - letter = letter_by_data(src + 3); - - if (letter) { - utf8_pack_char(devanagari + done, 0x0933); - done += 3; - src += 3; - if (letter->type == VOWEL) { - goto encode_vowel_modifier; - } else { - utf8_pack_char(devanagari + done, VIRAMA); - done += 3; - } - } - } - - /* candrabindu */ - if (strncmp(src, "m\u0310", 3) == 0) { - utf8_pack_char(devanagari + done, 0x0901); - done += 3; - src += 3; - continue; - } - /* zero-width non-joiner */ if (strncmp(src, "\u200c", 3) == 0) { - if (letter && (letter->code == VIRAMA || letter->type == CONSONANT)) { + if (last) { + /* not at the beginning of a word */ next = letter_by_data(src + 3); - if (next->type == CONSONANT) { - utf8_pack_char(devanagari + done, 0x200c); - done += 3; + if (next && next->type == CONSONANT) { + PACK_LETTER(devanagari + done, 0x200c); } } src += 3; @@ -266,43 +250,62 @@ int transliterate_latin_to_devanagari(const char *latin, char **ret) letter = letter_by_data(src); if (letter) { + if (letter->code == 0x090c) { /* independent ‘.l’ vowel */ + next = letter_by_data(src + 3); + if (last || (next && next->type == VOWEL && next->code != 0x090c)) { + letter = letter_by_code(0x0933); /* .la */ + } + } +encode_consonant: + /* A consonant or an initial vowel */ utf8_pack_char(devanagari + done, letter->code); - len = utf8_char_length(letter->code); - done += len; + PACK_LETTER(devanagari + done, letter->code); src += strlen(letter->data); - /* zero width joiner */ c = utf8_unpack_char(src); - if (c == ZERO_WIDTH_JOINER) { - utf8_pack_char(devanagari + done, ZERO_WIDTH_JOINER); - done += 3; + if (c == ZERO_WIDTH_JOINER) src += 3; - } + + last = letter; if (letter->type == VOWEL || letter->type == CODA) continue; -encode_vowel_modifier: - next = vowel_sign_by_data(src); - if (next) { - utf8_pack_char(devanagari + done, next->code); - done += utf8_char_length(next->code); - src += strlen(next->data); - } else { - if (*src == SCHWA_CHARACTER) { - src++; - } else { - if (letter->type == CONSONANT) { - utf8_pack_char(devanagari + done, VIRAMA); - done += utf8_char_length(VIRAMA); + + /* A vowel modifier (if any) */ + letter = vowel_sign_by_data(src); + if (letter) { + if (letter->code == 0x0962) { + next = letter_by_data(src + 3); + if (next && next->type == VOWEL) { + /* consonant ‘.la’ */ + letter = letter_by_code(0x0933); + if (last && last->type == CONSONANT) + PACK_LETTER(devanagari + done, VIRAMA); + + goto encode_consonant; } } + + if (c == ZERO_WIDTH_JOINER) + PACK_LETTER(devanagari + done, c); + + PACK_LETTER(devanagari + done, letter->code); + src += strlen(letter->data); + + } else if (*src == SCHWA_CHARACTER) { + src++; + + } else { + if (last->type == CONSONANT) { + PACK_LETTER(devanagari + done, VIRAMA); + } } + } else { c = utf8_unpack_char(src); - len = utf8_char_length(c); - utf8_pack_char(devanagari + done, c); - done += len; - src += len; + PACK_LETTER(devanagari + done, c); + src += utf8_char_length(c); + last = NULL; } }