rewrite transliteration
This commit is contained in:
parent
7b16aa03e4
commit
ccf4994273
2 changed files with 115 additions and 136 deletions
114
tests/translit.c
114
tests/translit.c
|
@ -6,105 +6,81 @@
|
|||
#define ZWJ "\u200d"
|
||||
#define ZWNJ "\u200c"
|
||||
|
||||
static void test_transliterate_devanagari_to_latin(const char *devanagari,
|
||||
const char *latin)
|
||||
static void test_translit(const char *devanagari, const char *latin)
|
||||
{
|
||||
char *str;
|
||||
char *a, *b;
|
||||
int ret;
|
||||
|
||||
ret = transliterate_devanagari_to_latin(devanagari, &str);
|
||||
ret = transliterate_devanagari_to_latin(devanagari, &a);
|
||||
ck_assert_int_eq(0, ret);
|
||||
ck_assert_str_eq(latin, str);
|
||||
free(str);
|
||||
ck_assert_str_eq(latin, a);
|
||||
|
||||
ret = transliterate_latin_to_devanagari(a, &b);
|
||||
ck_assert_int_eq(0, ret);
|
||||
ck_assert_str_eq(devanagari, b);
|
||||
|
||||
free(a);
|
||||
free(b);
|
||||
}
|
||||
|
||||
static void test_transliterate_latin_to_devanagari(const char *latin,
|
||||
const char *devanagari)
|
||||
{
|
||||
char *str;
|
||||
int ret;
|
||||
|
||||
ret = transliterate_latin_to_devanagari(latin, &str);
|
||||
ck_assert_str_eq(devanagari, str);
|
||||
free(str);
|
||||
}
|
||||
|
||||
START_TEST(test_translit_devanagari_to_latin)
|
||||
START_TEST(test_translit_words)
|
||||
{
|
||||
/* https://en.wikipedia.org/wiki/Sanskrit */
|
||||
test_transliterate_devanagari_to_latin("संस्कृतम्", "saṃskṛtam");
|
||||
test_translit("संस्कृतम्", "saṃskṛtam");
|
||||
|
||||
/* https://en.wikipedia.org/wiki/Bhagavad_Gita */
|
||||
test_transliterate_devanagari_to_latin("भगवद्गीता", "bhagavadgītā");
|
||||
test_translit("भगवद्गीता", "bhagavadgītā");
|
||||
|
||||
/* https://en.wikipedia.org/wiki/%C4%80ry%C4%81varta */
|
||||
test_transliterate_devanagari_to_latin("आर्यावर्त", "āryāvarta");
|
||||
test_translit("आर्यावर्त", "āryāvarta");
|
||||
|
||||
/* https://en.wikipedia.org/wiki/Mahabharata */
|
||||
test_transliterate_devanagari_to_latin("महाभारतम्", "mahābhāratam");
|
||||
test_translit("महाभारतम्", "mahābhāratam");
|
||||
|
||||
/* https://en.wikipedia.org/wiki/Devanagari */
|
||||
test_transliterate_devanagari_to_latin("देवनागरी", "devanāgarī");
|
||||
|
||||
}
|
||||
END_TEST
|
||||
|
||||
START_TEST(test_translit_latin_to_devanagari)
|
||||
{
|
||||
test_transliterate_latin_to_devanagari("saṃskṛtam", "संस्कृतम्");
|
||||
test_transliterate_latin_to_devanagari("bhagavadgītā", "भगवद्गीता");
|
||||
test_transliterate_latin_to_devanagari("āryāvarta", "आर्यावर्त");
|
||||
test_transliterate_latin_to_devanagari("mahābhāratam", "महाभारतम्");
|
||||
test_transliterate_latin_to_devanagari("devanāgarī", "देवनागरी");
|
||||
}
|
||||
END_TEST
|
||||
|
||||
START_TEST(test_translit_lla_sylable)
|
||||
{
|
||||
test_transliterate_devanagari_to_latin("अग्निमीळे", "agnimīḷe");
|
||||
test_transliterate_latin_to_devanagari("agnimīḷe", "अग्निमीळे");
|
||||
|
||||
/* rigveda 1.22.17 */
|
||||
test_transliterate_devanagari_to_latin("समूळ्हमस्य", "samūḷhamasya");
|
||||
test_transliterate_latin_to_devanagari("samūḷhamasya", "समूळ्हमस्य");
|
||||
|
||||
test_transliterate_devanagari_to_latin("चाकॢप्र", "cākḷpra");
|
||||
test_transliterate_latin_to_devanagari("cākḷpra", "चाकॢप्र");
|
||||
}
|
||||
END_TEST
|
||||
|
||||
START_TEST(test_translit_candrabindu)
|
||||
{
|
||||
test_transliterate_devanagari_to_latin("तान्यजत्राँ", "tānyajatrām̐");
|
||||
test_transliterate_latin_to_devanagari("tānyajatrām̐", "तान्यजत्राँ");
|
||||
}
|
||||
END_TEST
|
||||
|
||||
START_TEST(test_translit_zero_width_joiner)
|
||||
{
|
||||
test_transliterate_devanagari_to_latin("शृ शृ", "ś"ZWJ"ṛ śṛ");
|
||||
test_transliterate_latin_to_devanagari("ś"ZWJ"ṛ śṛ", "शृ शृ");
|
||||
test_translit("देवनागरी", "devanāgarī");
|
||||
}
|
||||
END_TEST
|
||||
|
||||
START_TEST(test_translit_vedic)
|
||||
{
|
||||
/* rigveda 1.25.4 */
|
||||
test_transliterate_devanagari_to_latin("वस्यइष्टये", "vasya"ZWNJ"iṣṭaye");
|
||||
test_transliterate_latin_to_devanagari("vasya"ZWNJ"iṣṭaye", "वस्यइष्टये");
|
||||
test_translit("वस्यइष्टये", "vasya"ZWNJ"iṣṭaye");
|
||||
|
||||
/* rigveda 3.5.2 */
|
||||
test_transliterate_devanagari_to_latin("पूर्वीर्ऋतस्य", "pūrvīr"ZWNJ"ṛtasya");
|
||||
test_transliterate_latin_to_devanagari("pūrvīr"ZWNJ"ṛtasya", "पूर्वीर्ऋतस्य");
|
||||
test_translit("पूर्वीर्ऋतस्य", "pūrvīr"ZWNJ"ṛtasya");
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/*
 * Cases for the Latin letter "ḷ", which appears in the expected output both
 * for the vowel ऌ and for the consonant ळ. Every pair is handed to
 * test_translit() as (devanagari, latin).
 */
START_TEST(test_translit_lla_sylable)
|
||||
{
|
||||
test_translit("अग्निमीळे", "agnimīḷe"); /* Rigveda 1.1.1 */
|
||||
test_translit("चाकॢप्र", "cākḷpra"); /* Rigveda 10.130.5 */
|
||||
test_translit("समूळ्हमस्य", "samūḷhamasya"); /* Rigveda 1.22.17 */
|
||||
|
||||
test_translit("ऌ ऌप ऌळ", "ḷ ḷpa ḷḷa"); /* "ḷ" at the beginning of a word = vowel ऌ */
|
||||
test_translit("ळ ळे ळृ", "ḷa ḷe ḷṛ"); /* "ḷ" followed by a vowel = consonant ळ */
|
||||
/* "ḷa" after a consonant or after a vowel inside a word: consonant ळ in every word */
test_translit("प्ळ पळ ओप्ळ", "pḷa paḷa opḷa");
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/*
 * Candrabindu case: the Devanagari word ends in the candrabindu sign and the
 * expected Latin text ends in "m" followed by the combining mark U+0310.
 */
START_TEST(test_translit_candrabindu)
|
||||
{
|
||||
test_translit("तान्यजत्राँ", "tānyajatrām̐");
|
||||
}
|
||||
END_TEST
|
||||
|
||||
START_TEST(test_translit_zero_width_joiner)
|
||||
{
|
||||
test_translit("शृ शृ", "ś"ZWJ"ṛ śṛ");
|
||||
}
|
||||
END_TEST
|
||||
|
||||
void register_translit_tests(TCase *test_case)
|
||||
{
|
||||
tcase_add_test(test_case, test_translit_devanagari_to_latin);
|
||||
tcase_add_test(test_case, test_translit_latin_to_devanagari);
|
||||
tcase_add_test(test_case, test_translit_words);
|
||||
tcase_add_test(test_case, test_translit_vedic);
|
||||
tcase_add_test(test_case, test_translit_lla_sylable);
|
||||
tcase_add_test(test_case, test_translit_candrabindu);
|
||||
tcase_add_test(test_case, test_translit_zero_width_joiner);
|
||||
tcase_add_test(test_case, test_translit_vedic);
|
||||
}
|
||||
|
|
|
@ -31,6 +31,12 @@ static struct translit_letter table[] = {
|
|||
{0x090f, VOWEL, "e"}, /* 13 */
|
||||
{0x0913, VOWEL, "o"}, /* 14 */
|
||||
|
||||
/* Codas */
|
||||
{0x0902, CODA, "\u1e43"}, /* anusvara (.m) */
|
||||
{0x0903, CODA, "\u1e25"}, /* visarga (.h) */
|
||||
{0x093d, CODA, "'"}, /* avagraha (') */
|
||||
{0x0901, CODA, "m\u0310"}, /* candrabindu */
|
||||
|
||||
/* Consonants */
|
||||
{0x0916, CONSONANT, "kh"}, /* 01 */
|
||||
{0x0918, CONSONANT, "gh"}, /* 02 */
|
||||
|
@ -67,12 +73,6 @@ static struct translit_letter table[] = {
|
|||
{0x0935, CONSONANT, "v"}, /* 33 */
|
||||
{0x0933, CONSONANT, "\u1e37"}, /* (.l) */
|
||||
|
||||
/* Codas */
|
||||
{0x0902, CODA, "\u1e43"}, /* anusvara (.m) */
|
||||
{0x0903, CODA, "\u1e25"}, /* visarga (.h) */
|
||||
{0x093d, CODA, "'"}, /* avagraha (') */
|
||||
{0x0901, CODA, "m\u0310"}, /* candrabindu */
|
||||
|
||||
/* Numbers */
|
||||
{0x0966, NUMBER, "0"},
|
||||
{0x0967, NUMBER, "1"},
|
||||
|
@ -118,6 +118,12 @@ static struct translit_letter *letter_by_code(unsigned int c)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Return non-zero when letter_by_code() finds an entry for code point c and
 * that entry's type is VOWEL_SIGN; return 0 otherwise, including when no
 * entry is found (NULL lookup result).
 */
static inline int is_vowel_sign(unsigned int c)
|
||||
{
|
||||
struct translit_letter *letter = letter_by_code(c);
|
||||
/* The NULL check short-circuits before the type field is read. */
return letter && letter->type == VOWEL_SIGN;
|
||||
}
|
||||
|
||||
int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
|
||||
{
|
||||
struct translit_letter *letter, *prev = NULL;
|
||||
|
@ -159,12 +165,10 @@ int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
|
|||
break;
|
||||
}
|
||||
} else {
|
||||
if (c == ZERO_WIDTH_JOINER) {
|
||||
/*
|
||||
* the zero width joiner joins consonants
|
||||
* so the inherent schwa has to be removed.
|
||||
*/
|
||||
if (done && *(latin + done - 1) == 'a') {
|
||||
if (c == ZERO_WIDTH_JOINER && done) {
|
||||
/* The ZWJ can substitute a virama so we need
|
||||
* to remove the inherent schwa. */
|
||||
if (is_vowel_sign(utf8_unpack_char(src))) {
|
||||
done--;
|
||||
}
|
||||
}
|
||||
|
@ -213,10 +217,15 @@ static struct translit_letter *vowel_sign_by_data(const char *data)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * PACK_LETTER(d, c): write code point c at d with utf8_pack_char() and add
 * utf8_char_length(c) to `done`.
 *
 * Caveats visible from the expansion:
 *  - `done` is not a parameter; a variable with that name must be in scope
 *    at every use site.
 *  - `c` is expanded twice, so do not pass an expression with side effects.
 *  - The body is a bare { } block rather than do { } while (0), so
 *    `if (x) PACK_LETTER(d, c); else ...` would not compile; brace such uses.
 */
#define PACK_LETTER(d, c) { \
|
||||
utf8_pack_char(d, c); \
|
||||
done += utf8_char_length(c); \
|
||||
}
|
||||
|
||||
int transliterate_latin_to_devanagari(const char *latin, char **ret)
|
||||
{
|
||||
struct translit_letter *letter = NULL, *next;
|
||||
unsigned int alloc = 0, done = 0, c, len;
|
||||
struct translit_letter *letter = NULL, *next, *last = NULL;
|
||||
unsigned int alloc = 0, done = 0, c = 0;
|
||||
const char *src = latin;
|
||||
char *devanagari = NULL;
|
||||
|
||||
|
@ -226,38 +235,13 @@ int transliterate_latin_to_devanagari(const char *latin, char **ret)
|
|||
alloc += CHUNKSIZE;
|
||||
}
|
||||
|
||||
/* consonant (.l) */
|
||||
if (strncmp(src, "\u1e37", 3) == 0) {
|
||||
letter = letter_by_data(src + 3);
|
||||
|
||||
if (letter) {
|
||||
utf8_pack_char(devanagari + done, 0x0933);
|
||||
done += 3;
|
||||
src += 3;
|
||||
if (letter->type == VOWEL) {
|
||||
goto encode_vowel_modifier;
|
||||
} else {
|
||||
utf8_pack_char(devanagari + done, VIRAMA);
|
||||
done += 3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* candrabindu */
|
||||
if (strncmp(src, "m\u0310", 3) == 0) {
|
||||
utf8_pack_char(devanagari + done, 0x0901);
|
||||
done += 3;
|
||||
src += 3;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* zero-width non-joiner */
|
||||
if (strncmp(src, "\u200c", 3) == 0) {
|
||||
if (letter && (letter->code == VIRAMA || letter->type == CONSONANT)) {
|
||||
if (last) {
|
||||
/* not at the beginning of a word */
|
||||
next = letter_by_data(src + 3);
|
||||
if (next->type == CONSONANT) {
|
||||
utf8_pack_char(devanagari + done, 0x200c);
|
||||
done += 3;
|
||||
if (next && next->type == CONSONANT) {
|
||||
PACK_LETTER(devanagari + done, 0x200c);
|
||||
}
|
||||
}
|
||||
src += 3;
|
||||
|
@ -266,43 +250,62 @@ int transliterate_latin_to_devanagari(const char *latin, char **ret)
|
|||
|
||||
letter = letter_by_data(src);
|
||||
if (letter) {
|
||||
if (letter->code == 0x090c) { /* independent ‘.l’ vowel */
|
||||
next = letter_by_data(src + 3);
|
||||
if (last || (next && next->type == VOWEL && next->code != 0x090c)) {
|
||||
letter = letter_by_code(0x0933); /* .la */
|
||||
}
|
||||
}
|
||||
encode_consonant:
|
||||
/* A consonant or an initial vowel */
|
||||
utf8_pack_char(devanagari + done, letter->code);
|
||||
len = utf8_char_length(letter->code);
|
||||
done += len;
|
||||
PACK_LETTER(devanagari + done, letter->code);
|
||||
src += strlen(letter->data);
|
||||
|
||||
/* zero width joiner */
|
||||
c = utf8_unpack_char(src);
|
||||
if (c == ZERO_WIDTH_JOINER) {
|
||||
utf8_pack_char(devanagari + done, ZERO_WIDTH_JOINER);
|
||||
done += 3;
|
||||
if (c == ZERO_WIDTH_JOINER)
|
||||
src += 3;
|
||||
}
|
||||
|
||||
last = letter;
|
||||
|
||||
if (letter->type == VOWEL || letter->type == CODA)
|
||||
continue;
|
||||
encode_vowel_modifier:
|
||||
next = vowel_sign_by_data(src);
|
||||
if (next) {
|
||||
utf8_pack_char(devanagari + done, next->code);
|
||||
done += utf8_char_length(next->code);
|
||||
src += strlen(next->data);
|
||||
} else {
|
||||
if (*src == SCHWA_CHARACTER) {
|
||||
src++;
|
||||
} else {
|
||||
if (letter->type == CONSONANT) {
|
||||
utf8_pack_char(devanagari + done, VIRAMA);
|
||||
done += utf8_char_length(VIRAMA);
|
||||
|
||||
/* A vowel modifier (if any) */
|
||||
letter = vowel_sign_by_data(src);
|
||||
if (letter) {
|
||||
if (letter->code == 0x0962) {
|
||||
next = letter_by_data(src + 3);
|
||||
if (next && next->type == VOWEL) {
|
||||
/* consonant ‘.la’ */
|
||||
letter = letter_by_code(0x0933);
|
||||
if (last && last->type == CONSONANT)
|
||||
PACK_LETTER(devanagari + done, VIRAMA);
|
||||
|
||||
goto encode_consonant;
|
||||
}
|
||||
}
|
||||
|
||||
if (c == ZERO_WIDTH_JOINER)
|
||||
PACK_LETTER(devanagari + done, c);
|
||||
|
||||
PACK_LETTER(devanagari + done, letter->code);
|
||||
src += strlen(letter->data);
|
||||
|
||||
} else if (*src == SCHWA_CHARACTER) {
|
||||
src++;
|
||||
|
||||
} else {
|
||||
if (last->type == CONSONANT) {
|
||||
PACK_LETTER(devanagari + done, VIRAMA);
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
c = utf8_unpack_char(src);
|
||||
len = utf8_char_length(c);
|
||||
utf8_pack_char(devanagari + done, c);
|
||||
done += len;
|
||||
src += len;
|
||||
PACK_LETTER(devanagari + done, c);
|
||||
src += utf8_char_length(c);
|
||||
last = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue