From 412be221a733a65057758906ba544fa115a9ce67 Mon Sep 17 00:00:00 2001
From: Vlasta Vesely
Date: Wed, 1 Jan 2020 22:02:44 +0100
Subject: [PATCH] implement reverse transliteration (latin -> devanagari)

---
 iast.c            |   6 +--
 tests/translit.c  |  25 +++++++++++-
 transliteration.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++-
 transliteration.h |   1 +
 4 files changed, 129 insertions(+), 6 deletions(-)

diff --git a/iast.c b/iast.c
index b71ab52..005e801 100644
--- a/iast.c
+++ b/iast.c
@@ -6,6 +6,9 @@
 
 
 static struct translit_letter table[] = {
+	/* Special characters */
+	{0x0950, SPECIAL, "aum"},		/* aum */
+
 	/* Vowels */
 	{0x0910, VOWEL, "ai"},			/* 01 */
 	{0x0914, VOWEL, "au"},			/* 02 */
@@ -62,9 +65,6 @@ static struct translit_letter table[] = {
 	{0x0903, CODA, "\u1e25"},		/* visarga (.h) */
 	{0x093d, CODA, "'"},			/* avagrada (') */
 
-	/* Special characters */
-	{0x0950, SPECIAL, "aum"},		/* aum */
-
 	/* Numbers */
 	{0x0966, NUMBER, "0"},
 	{0x0967, NUMBER, "1"},
diff --git a/tests/translit.c b/tests/translit.c
index 78fa5a5..008d35a 100644
--- a/tests/translit.c
+++ b/tests/translit.c
@@ -1,7 +1,7 @@
+#include "test.h"
 #include "translit.h"
 #include "../transliteration.h"
 
-
 START_TEST(test_translit_devanagari_to_latin)
 {
 	char *latin;
@@ -28,7 +28,30 @@ START_TEST(test_translit_devanagari_to_latin)
 }
 END_TEST
 
+START_TEST(test_translit_latin_to_devanagari)
+{
+	char *devanagari;
+
+	devanagari = transliterate_latin_to_devanagari("saṃskṛtam");
+	ck_assert_str_eq("संस्कृतम्", devanagari);
+	free(devanagari);
+
+	devanagari = transliterate_latin_to_devanagari("bhagavadgītā");
+	ck_assert_str_eq("भगवद्गीता", devanagari);
+	free(devanagari);
+
+	devanagari = transliterate_latin_to_devanagari("āryāvarta");
+	ck_assert_str_eq("आर्यावर्त", devanagari);
+	free(devanagari);
+
+	devanagari = transliterate_latin_to_devanagari("mahābhāratam");
+	ck_assert_str_eq("महाभारतम्", devanagari);
+	free(devanagari);
+}
+END_TEST
+
 void register_translit_tests(TCase *test_case)
 {
 	tcase_add_test(test_case, test_translit_devanagari_to_latin);
+	tcase_add_test(test_case, test_translit_latin_to_devanagari);
 }
diff --git a/transliteration.c b/transliteration.c
index 2dc0066..49a022c 100644
--- a/transliteration.c
+++ b/transliteration.c
@@ -6,6 +6,7 @@
 #include "utf8.h"
 
 #define SCHWA_CHARACTER 'a'
+#define VIRAMA 0x094d
 #define CHUNKSIZE 1024
 
 static struct translit_letter *letter_by_code(struct translit_letter *table,
@@ -22,8 +23,7 @@ static struct translit_letter *letter_by_code(struct translit_letter *table,
 
 char *transliterate_devanagari_to_latin(const char *devanagari)
 {
-	struct translit_letter *table;
-	struct translit_letter *letter;
+	struct translit_letter *table, *letter;
 	unsigned int c, alloc = 0, done = 0, len;
 	const char *src = devanagari;
 	char *latin = NULL;
@@ -70,3 +70,102 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
 
 	return latin;
 }
+
+/*
+ * Return the first table entry whose latin data is a prefix of `data`, or
+ * NULL when no entry matches. The scan stops at the entry with code 0.
+ */
+static struct translit_letter *letter_by_data(struct translit_letter *table,
+		const char *data)
+{
+	while (table->code != 0) {
+		unsigned int len = strlen(table->data);
+		if (len && strncmp(table->data, data, len) == 0)
+			return table;
+		table++;
+	}
+
+	return NULL;
+}
+
+/* Same lookup as letter_by_data(), restricted to VOWEL_SIGN entries. */
+static struct translit_letter *vowel_sign_by_data(struct translit_letter *table,
+		const char *data)
+{
+	while (table->code != 0) {
+		unsigned int len = strlen(table->data);
+		if (len && strncmp(table->data, data, len) == 0 &&
+				table->type == VOWEL_SIGN)
+			return table;
+		table++;
+	}
+
+	return NULL;
+}
+
+/*
+ * Transliterate IAST (latin) text to devanagari.
+ *
+ * Returns a heap-allocated, NUL-terminated string that the caller must
+ * free(), or NULL when the input is empty or memory allocation fails.
+ */
+char *transliterate_latin_to_devanagari(const char *latin)
+{
+	struct translit_letter *table, *letter, *next;
+	unsigned int alloc = 0, done = 0;
+	const char *src = latin;
+	char *devanagari = NULL, *grown;
+
+	table = get_iast_transliteration_table();
+
+	while (*src) {
+		/*
+		 * One pass writes at most two code points (a letter plus a
+		 * vowel sign or virama), so reserve room for both and for
+		 * the NUL appended after the loop. Assumes no code point
+		 * packs to more than UNICODE_MAX_LENGTH bytes.
+		 */
+		if (alloc < done + 2 * UNICODE_MAX_LENGTH + 1) {
+			grown = realloc(devanagari, alloc + CHUNKSIZE);
+			if (!grown) {
+				free(devanagari);
+				return NULL;
+			}
+			devanagari = grown;
+			alloc += CHUNKSIZE;
+		}
+
+		letter = letter_by_data(table, src);
+		if (!letter) {
+			/* Not in the table: copy the byte through as is. */
+			devanagari[done++] = *src++;
+			continue;
+		}
+
+		utf8_pack_char(devanagari + done, letter->code);
+		done += utf8_char_length(letter->code);
+		src += strlen(letter->data);
+
+		if (letter->type == VOWEL || letter->type == CODA)
+			continue;
+
+		next = vowel_sign_by_data(table, src);
+		if (next) {
+			utf8_pack_char(devanagari + done, next->code);
+			done += utf8_char_length(next->code);
+			src += strlen(next->data);
+		} else if (*src == SCHWA_CHARACTER) {
+			/* The inherent vowel has no sign of its own. */
+			src++;
+		} else if (letter->type == CONSONANT) {
+			/* No vowel follows: mark the bare consonant. */
+			utf8_pack_char(devanagari + done, VIRAMA);
+			done += utf8_char_length(VIRAMA);
+		}
+	}
+
+	if (devanagari)
+		devanagari[done] = '\0';
+
+	return devanagari;
+}
diff --git a/transliteration.h b/transliteration.h
index 9e52b2f..cbdd17d 100644
--- a/transliteration.h
+++ b/transliteration.h
@@ -23,6 +23,7 @@ struct translit_context {
 };
 
 char *transliterate_devanagari_to_latin(const char *text);
+char *transliterate_latin_to_devanagari(const char *text);
 
 static inline int is_devanagari(unsigned int code)
 {