rewrite transliteration

This commit is contained in:
Vlasta Vesely 2021-12-28 21:06:41 +01:00
parent 7b16aa03e4
commit ccf4994273
No known key found for this signature in database
GPG key ID: EB0E649DC0DFCC22
2 changed files with 115 additions and 136 deletions

View file

@ -6,105 +6,81 @@
#define ZWJ "\u200d"
#define ZWNJ "\u200c"
static void test_transliterate_devanagari_to_latin(const char *devanagari,
const char *latin)
static void test_translit(const char *devanagari, const char *latin)
{
char *str;
char *a, *b;
int ret;
ret = transliterate_devanagari_to_latin(devanagari, &str);
ret = transliterate_devanagari_to_latin(devanagari, &a);
ck_assert_int_eq(0, ret);
ck_assert_str_eq(latin, str);
free(str);
ck_assert_str_eq(latin, a);
ret = transliterate_latin_to_devanagari(a, &b);
ck_assert_int_eq(0, ret);
ck_assert_str_eq(devanagari, b);
free(a);
free(b);
}
static void test_transliterate_latin_to_devanagari(const char *latin,
const char *devanagari)
{
char *str;
int ret;
ret = transliterate_latin_to_devanagari(latin, &str);
ck_assert_str_eq(devanagari, str);
free(str);
}
START_TEST(test_translit_devanagari_to_latin)
START_TEST(test_translit_words)
{
/* https://en.wikipedia.org/wiki/Sanskrit */
test_transliterate_devanagari_to_latin("संस्कृतम्", "saṃskṛtam");
test_translit("संस्कृतम्", "saṃskṛtam");
/* https://en.wikipedia.org/wiki/Bhagavad_Gita */
test_transliterate_devanagari_to_latin("भगवद्गीता", "bhagavadgītā");
test_translit("भगवद्गीता", "bhagavadgītā");
/* https://en.wikipedia.org/wiki/%C4%80ry%C4%81varta */
test_transliterate_devanagari_to_latin("आर्यावर्त", "āryāvarta");
test_translit("आर्यावर्त", "āryāvarta");
/* https://en.wikipedia.org/wiki/Mahabharata */
test_transliterate_devanagari_to_latin("महाभारतम्", "mahābhāratam");
test_translit("महाभारतम्", "mahābhāratam");
/* https://en.wikipedia.org/wiki/Devanagari */
test_transliterate_devanagari_to_latin("देवनागरी", "devanāgarī");
}
END_TEST
START_TEST(test_translit_latin_to_devanagari)
{
test_transliterate_latin_to_devanagari("saṃskṛtam", "संस्कृतम्");
test_transliterate_latin_to_devanagari("bhagavadgītā", "भगवद्गीता");
test_transliterate_latin_to_devanagari("āryāvarta", "आर्यावर्त");
test_transliterate_latin_to_devanagari("mahābhāratam", "महाभारतम्");
test_transliterate_latin_to_devanagari("devanāgarī", "देवनागरी");
}
END_TEST
START_TEST(test_translit_lla_sylable)
{
test_transliterate_devanagari_to_latin("अग्निमीळे", "agnimīḷe");
test_transliterate_latin_to_devanagari("agnimīḷe", "अग्निमीळे");
/* rigveda 1.22.17 */
test_transliterate_devanagari_to_latin("समूळ्हमस्य", "samūḷhamasya");
test_transliterate_latin_to_devanagari("samūḷhamasya", "समूळ्हमस्य");
test_transliterate_devanagari_to_latin("चाकॢप्र", "cākḷpra");
test_transliterate_latin_to_devanagari("cākḷpra", "चाकॢप्र");
}
END_TEST
START_TEST(test_translit_candrabindu)
{
test_transliterate_devanagari_to_latin("तान्यजत्राँ", "tānyajatrām̐");
test_transliterate_latin_to_devanagari("tānyajatrām̐", "तान्यजत्राँ");
}
END_TEST
START_TEST(test_translit_zero_width_joiner)
{
test_transliterate_devanagari_to_latin("श‍ृ शृ", "ś"ZWJ"ṛ śṛ");
test_transliterate_latin_to_devanagari("ś"ZWJ"ṛ śṛ", "श‍ृ शृ");
test_translit("देवनागरी", "devanāgarī");
}
END_TEST
START_TEST(test_translit_vedic)
{
/* rigveda 1.25.4 */
test_transliterate_devanagari_to_latin("वस्यइष्टये", "vasya"ZWNJ"iṣṭaye");
test_transliterate_latin_to_devanagari("vasya"ZWNJ"iṣṭaye", "वस्यइष्टये");
test_translit("वस्यइष्टये", "vasya"ZWNJ"iṣṭaye");
/* rigveda 3.5.2 */
test_transliterate_devanagari_to_latin("पूर्वीर्ऋतस्य", "pūrvīr"ZWNJ"ṛtasya");
test_transliterate_latin_to_devanagari("pūrvīr"ZWNJ"ṛtasya", "पूर्वीर्ऋतस्य");
test_translit("पूर्वीर्ऋतस्य", "pūrvīr"ZWNJ"ṛtasya");
}
END_TEST
/*
 * Cases for the two Devanagari letters that are both romanized as "ḷ":
 * the vowel (ऌ and its dependent sign) and the consonant ळ.
 * Every pair goes through test_translit(), which checks Devanagari -> Latin
 * and then feeds the Latin result back to check Latin -> Devanagari.
 */
START_TEST(test_translit_lla_sylable)
{
test_translit("अग्निमीळे", "agnimīḷe"); /* rigveda 1.1.1 */
test_translit("चाकॢप्र", "cākḷpra"); /* rigveda 10.130.5 */
test_translit("समूळ्हमस्य", "samūḷhamasya"); /* rigveda 1.22.17 */
test_translit("ऌ ऌप ऌळ", "ḷ ḷpa ḷḷa"); /* at the beginning = vowel */
test_translit("ळ ळे ळृ", "ḷa ḷe ḷṛ"); /* followed by a vowel = consonant */
test_translit("प्ळ पळ ओप्ळ", "pḷa paḷa opḷa");
}
END_TEST
/*
 * A word ending in candrabindu, expected as "m" followed by U+0310 in the
 * Latin form; test_translit() checks both conversion directions.
 */
START_TEST(test_translit_candrabindu)
{
test_translit("तान्यजत्राँ", "tānyajatrām̐");
}
END_TEST
/*
 * Two clusters built from the same letters: the first has a zero width
 * joiner between the consonant and the vowel sign, the second does not.
 * The Latin side is expected to keep the ZWJ in the first cluster only.
 * NOTE(review): the Devanagari literal contains an invisible U+200D; take
 * care that editors do not strip it.
 */
START_TEST(test_translit_zero_width_joiner)
{
test_translit("श‍ृ शृ", "ś"ZWJ"ṛ śṛ");
}
END_TEST
void register_translit_tests(TCase *test_case)
{
tcase_add_test(test_case, test_translit_devanagari_to_latin);
tcase_add_test(test_case, test_translit_latin_to_devanagari);
tcase_add_test(test_case, test_translit_words);
tcase_add_test(test_case, test_translit_vedic);
tcase_add_test(test_case, test_translit_lla_sylable);
tcase_add_test(test_case, test_translit_candrabindu);
tcase_add_test(test_case, test_translit_zero_width_joiner);
tcase_add_test(test_case, test_translit_vedic);
}

View file

@ -31,6 +31,12 @@ static struct translit_letter table[] = {
{0x090f, VOWEL, "e"}, /* 13 */
{0x0913, VOWEL, "o"}, /* 14 */
/* Codas */
{0x0902, CODA, "\u1e43"}, /* anusvara (.m) */
{0x0903, CODA, "\u1e25"}, /* visarga (.h) */
{0x093d, CODA, "'"}, /* avagrada (') */
{0x0901, CODA, "m\u0310"}, /* candrabindu */
/* Consonants */
{0x0916, CONSONANT, "kh"}, /* 01 */
{0x0918, CONSONANT, "gh"}, /* 02 */
@ -67,12 +73,6 @@ static struct translit_letter table[] = {
{0x0935, CONSONANT, "v"}, /* 33 */
{0x0933, CONSONANT, "\u1e37"}, /* (.l) */
/* Codas */
{0x0902, CODA, "\u1e43"}, /* anusvara (.m) */
{0x0903, CODA, "\u1e25"}, /* visarga (.h) */
{0x093d, CODA, "'"}, /* avagrada (') */
{0x0901, CODA, "m\u0310"}, /* candrabindu */
/* Numbers */
{0x0966, NUMBER, "0"},
{0x0967, NUMBER, "1"},
@ -118,6 +118,12 @@ static struct translit_letter *letter_by_code(unsigned int c)
return NULL;
}
static inline int is_vowel_sign(unsigned int c)
{
struct translit_letter *letter = letter_by_code(c);
return letter && letter->type == VOWEL_SIGN;
}
int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
{
struct translit_letter *letter, *prev = NULL;
@ -159,12 +165,10 @@ int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
break;
}
} else {
if (c == ZERO_WIDTH_JOINER) {
/*
* the zero width joiner joins consonants
* so the inherent schwa has to be removed.
*/
if (done && *(latin + done - 1) == 'a') {
if (c == ZERO_WIDTH_JOINER && done) {
/* The ZWJ can substitute a virama so we need
* to remove the inherent schwa. */
if (is_vowel_sign(utf8_unpack_char(src))) {
done--;
}
}
@ -213,10 +217,15 @@ static struct translit_letter *vowel_sign_by_data(const char *data)
return NULL;
}
/*
 * Pass c to utf8_pack_char(d, c) and then add utf8_char_length(c) to the
 * running offset.
 *
 * The macro updates a variable named `done`, which must be in scope where
 * the macro is used. c is evaluated twice, so do not pass an expression
 * with side effects.
 *
 * The body is wrapped in do { } while (0) so that `PACK_LETTER(...);` is
 * exactly one statement. With a bare { } block the trailing semicolon forms
 * a second, empty statement, which breaks an unbraced `if (...) PACK_LETTER(...);
 * else ...`.
 */
#define PACK_LETTER(d, c) do { \
	utf8_pack_char((d), (c)); \
	done += utf8_char_length((c)); \
} while (0)
int transliterate_latin_to_devanagari(const char *latin, char **ret)
{
struct translit_letter *letter = NULL, *next;
unsigned int alloc = 0, done = 0, c, len;
struct translit_letter *letter = NULL, *next, *last = NULL;
unsigned int alloc = 0, done = 0, c = 0;
const char *src = latin;
char *devanagari = NULL;
@ -226,38 +235,13 @@ int transliterate_latin_to_devanagari(const char *latin, char **ret)
alloc += CHUNKSIZE;
}
/* consonant (.l) */
if (strncmp(src, "\u1e37", 3) == 0) {
letter = letter_by_data(src + 3);
if (letter) {
utf8_pack_char(devanagari + done, 0x0933);
done += 3;
src += 3;
if (letter->type == VOWEL) {
goto encode_vowel_modifier;
} else {
utf8_pack_char(devanagari + done, VIRAMA);
done += 3;
}
}
}
/* candrabindu */
if (strncmp(src, "m\u0310", 3) == 0) {
utf8_pack_char(devanagari + done, 0x0901);
done += 3;
src += 3;
continue;
}
/* zero-width non-joiner */
if (strncmp(src, "\u200c", 3) == 0) {
if (letter && (letter->code == VIRAMA || letter->type == CONSONANT)) {
if (last) {
/* not at the beginning of a word */
next = letter_by_data(src + 3);
if (next->type == CONSONANT) {
utf8_pack_char(devanagari + done, 0x200c);
done += 3;
if (next && next->type == CONSONANT) {
PACK_LETTER(devanagari + done, 0x200c);
}
}
src += 3;
@ -266,43 +250,62 @@ int transliterate_latin_to_devanagari(const char *latin, char **ret)
letter = letter_by_data(src);
if (letter) {
if (letter->code == 0x090c) { /* independent .l vowel */
next = letter_by_data(src + 3);
if (last || (next && next->type == VOWEL && next->code != 0x090c)) {
letter = letter_by_code(0x0933); /* .la */
}
}
encode_consonant:
/* A consonant or an initial vowel */
utf8_pack_char(devanagari + done, letter->code);
len = utf8_char_length(letter->code);
done += len;
PACK_LETTER(devanagari + done, letter->code);
src += strlen(letter->data);
/* zero width joiner */
c = utf8_unpack_char(src);
if (c == ZERO_WIDTH_JOINER) {
utf8_pack_char(devanagari + done, ZERO_WIDTH_JOINER);
done += 3;
if (c == ZERO_WIDTH_JOINER)
src += 3;
}
last = letter;
if (letter->type == VOWEL || letter->type == CODA)
continue;
encode_vowel_modifier:
next = vowel_sign_by_data(src);
if (next) {
utf8_pack_char(devanagari + done, next->code);
done += utf8_char_length(next->code);
src += strlen(next->data);
} else {
if (*src == SCHWA_CHARACTER) {
src++;
} else {
if (letter->type == CONSONANT) {
utf8_pack_char(devanagari + done, VIRAMA);
done += utf8_char_length(VIRAMA);
/* A vowel modifier (if any) */
letter = vowel_sign_by_data(src);
if (letter) {
if (letter->code == 0x0962) {
next = letter_by_data(src + 3);
if (next && next->type == VOWEL) {
/* consonant .la */
letter = letter_by_code(0x0933);
if (last && last->type == CONSONANT)
PACK_LETTER(devanagari + done, VIRAMA);
goto encode_consonant;
}
}
if (c == ZERO_WIDTH_JOINER)
PACK_LETTER(devanagari + done, c);
PACK_LETTER(devanagari + done, letter->code);
src += strlen(letter->data);
} else if (*src == SCHWA_CHARACTER) {
src++;
} else {
if (last->type == CONSONANT) {
PACK_LETTER(devanagari + done, VIRAMA);
}
}
} else {
c = utf8_unpack_char(src);
len = utf8_char_length(c);
utf8_pack_char(devanagari + done, c);
done += len;
src += len;
PACK_LETTER(devanagari + done, c);
src += utf8_char_length(c);
last = NULL;
}
}