rewrite transliteration
This commit is contained in:
parent
7b16aa03e4
commit
ccf4994273
2 changed files with 115 additions and 136 deletions
114
tests/translit.c
114
tests/translit.c
|
@ -6,105 +6,81 @@
|
||||||
#define ZWJ "\u200d"
|
#define ZWJ "\u200d"
|
||||||
#define ZWNJ "\u200c"
|
#define ZWNJ "\u200c"
|
||||||
|
|
||||||
static void test_transliterate_devanagari_to_latin(const char *devanagari,
|
static void test_translit(const char *devanagari, const char *latin)
|
||||||
const char *latin)
|
|
||||||
{
|
{
|
||||||
char *str;
|
char *a, *b;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
ret = transliterate_devanagari_to_latin(devanagari, &str);
|
ret = transliterate_devanagari_to_latin(devanagari, &a);
|
||||||
ck_assert_int_eq(0, ret);
|
ck_assert_int_eq(0, ret);
|
||||||
ck_assert_str_eq(latin, str);
|
ck_assert_str_eq(latin, a);
|
||||||
free(str);
|
|
||||||
|
ret = transliterate_latin_to_devanagari(a, &b);
|
||||||
|
ck_assert_int_eq(0, ret);
|
||||||
|
ck_assert_str_eq(devanagari, b);
|
||||||
|
|
||||||
|
free(a);
|
||||||
|
free(b);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void test_transliterate_latin_to_devanagari(const char *latin,
|
START_TEST(test_translit_words)
|
||||||
const char *devanagari)
|
|
||||||
{
|
|
||||||
char *str;
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
ret = transliterate_latin_to_devanagari(latin, &str);
|
|
||||||
ck_assert_str_eq(devanagari, str);
|
|
||||||
free(str);
|
|
||||||
}
|
|
||||||
|
|
||||||
START_TEST(test_translit_devanagari_to_latin)
|
|
||||||
{
|
{
|
||||||
/* https://en.wikipedia.org/wiki/Sanskrit */
|
/* https://en.wikipedia.org/wiki/Sanskrit */
|
||||||
test_transliterate_devanagari_to_latin("संस्कृतम्", "saṃskṛtam");
|
test_translit("संस्कृतम्", "saṃskṛtam");
|
||||||
|
|
||||||
/* https://en.wikipedia.org/wiki/Bhagavad_Gita */
|
/* https://en.wikipedia.org/wiki/Bhagavad_Gita */
|
||||||
test_transliterate_devanagari_to_latin("भगवद्गीता", "bhagavadgītā");
|
test_translit("भगवद्गीता", "bhagavadgītā");
|
||||||
|
|
||||||
/* https://en.wikipedia.org/wiki/%C4%80ry%C4%81varta */
|
/* https://en.wikipedia.org/wiki/%C4%80ry%C4%81varta */
|
||||||
test_transliterate_devanagari_to_latin("आर्यावर्त", "āryāvarta");
|
test_translit("आर्यावर्त", "āryāvarta");
|
||||||
|
|
||||||
/* https://en.wikipedia.org/wiki/Mahabharata */
|
/* https://en.wikipedia.org/wiki/Mahabharata */
|
||||||
test_transliterate_devanagari_to_latin("महाभारतम्", "mahābhāratam");
|
test_translit("महाभारतम्", "mahābhāratam");
|
||||||
|
|
||||||
/* https://en.wikipedia.org/wiki/Devanagari */
|
/* https://en.wikipedia.org/wiki/Devanagari */
|
||||||
test_transliterate_devanagari_to_latin("देवनागरी", "devanāgarī");
|
test_translit("देवनागरी", "devanāgarī");
|
||||||
|
|
||||||
}
|
|
||||||
END_TEST
|
|
||||||
|
|
||||||
START_TEST(test_translit_latin_to_devanagari)
|
|
||||||
{
|
|
||||||
test_transliterate_latin_to_devanagari("saṃskṛtam", "संस्कृतम्");
|
|
||||||
test_transliterate_latin_to_devanagari("bhagavadgītā", "भगवद्गीता");
|
|
||||||
test_transliterate_latin_to_devanagari("āryāvarta", "आर्यावर्त");
|
|
||||||
test_transliterate_latin_to_devanagari("mahābhāratam", "महाभारतम्");
|
|
||||||
test_transliterate_latin_to_devanagari("devanāgarī", "देवनागरी");
|
|
||||||
}
|
|
||||||
END_TEST
|
|
||||||
|
|
||||||
START_TEST(test_translit_lla_sylable)
|
|
||||||
{
|
|
||||||
test_transliterate_devanagari_to_latin("अग्निमीळे", "agnimīḷe");
|
|
||||||
test_transliterate_latin_to_devanagari("agnimīḷe", "अग्निमीळे");
|
|
||||||
|
|
||||||
/* rigveda 1.22.17 */
|
|
||||||
test_transliterate_devanagari_to_latin("समूळ्हमस्य", "samūḷhamasya");
|
|
||||||
test_transliterate_latin_to_devanagari("samūḷhamasya", "समूळ्हमस्य");
|
|
||||||
|
|
||||||
test_transliterate_devanagari_to_latin("चाकॢप्र", "cākḷpra");
|
|
||||||
test_transliterate_latin_to_devanagari("cākḷpra", "चाकॢप्र");
|
|
||||||
}
|
|
||||||
END_TEST
|
|
||||||
|
|
||||||
START_TEST(test_translit_candrabindu)
|
|
||||||
{
|
|
||||||
test_transliterate_devanagari_to_latin("तान्यजत्राँ", "tānyajatrām̐");
|
|
||||||
test_transliterate_latin_to_devanagari("tānyajatrām̐", "तान्यजत्राँ");
|
|
||||||
}
|
|
||||||
END_TEST
|
|
||||||
|
|
||||||
START_TEST(test_translit_zero_width_joiner)
|
|
||||||
{
|
|
||||||
test_transliterate_devanagari_to_latin("शृ शृ", "ś"ZWJ"ṛ śṛ");
|
|
||||||
test_transliterate_latin_to_devanagari("ś"ZWJ"ṛ śṛ", "शृ शृ");
|
|
||||||
}
|
}
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
START_TEST(test_translit_vedic)
|
START_TEST(test_translit_vedic)
|
||||||
{
|
{
|
||||||
/* rigveda 1.25.4 */
|
/* rigveda 1.25.4 */
|
||||||
test_transliterate_devanagari_to_latin("वस्यइष्टये", "vasya"ZWNJ"iṣṭaye");
|
test_translit("वस्यइष्टये", "vasya"ZWNJ"iṣṭaye");
|
||||||
test_transliterate_latin_to_devanagari("vasya"ZWNJ"iṣṭaye", "वस्यइष्टये");
|
|
||||||
|
|
||||||
/* rigveda 3.5.2 */
|
/* rigveda 3.5.2 */
|
||||||
test_transliterate_devanagari_to_latin("पूर्वीर्ऋतस्य", "pūrvīr"ZWNJ"ṛtasya");
|
test_translit("पूर्वीर्ऋतस्य", "pūrvīr"ZWNJ"ṛtasya");
|
||||||
test_transliterate_latin_to_devanagari("pūrvīr"ZWNJ"ṛtasya", "पूर्वीर्ऋतस्य");
|
}
|
||||||
|
END_TEST
|
||||||
|
|
||||||
|
START_TEST(test_translit_lla_sylable)
|
||||||
|
{
|
||||||
|
test_translit("अग्निमीळे", "agnimīḷe"); /* rigveda 1.1.1 */
|
||||||
|
test_translit("चाकॢप्र", "cākḷpra"); /* rigveda 10.130.5 */
|
||||||
|
test_translit("समूळ्हमस्य", "samūḷhamasya"); /* rigveda 1.22.17 */
|
||||||
|
|
||||||
|
test_translit("ऌ ऌप ऌळ", "ḷ ḷpa ḷḷa"); /* at the beginning = vowel */
|
||||||
|
test_translit("ळ ळे ळृ", "ḷa ḷe ḷṛ"); /* followed by a vowel = consonant */
|
||||||
|
test_translit("प्ळ पळ ओप्ळ", "pḷa paḷa opḷa");
|
||||||
|
}
|
||||||
|
END_TEST
|
||||||
|
|
||||||
|
START_TEST(test_translit_candrabindu)
|
||||||
|
{
|
||||||
|
test_translit("तान्यजत्राँ", "tānyajatrām̐");
|
||||||
|
}
|
||||||
|
END_TEST
|
||||||
|
|
||||||
|
START_TEST(test_translit_zero_width_joiner)
|
||||||
|
{
|
||||||
|
test_translit("शृ शृ", "ś"ZWJ"ṛ śṛ");
|
||||||
}
|
}
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
void register_translit_tests(TCase *test_case)
|
void register_translit_tests(TCase *test_case)
|
||||||
{
|
{
|
||||||
tcase_add_test(test_case, test_translit_devanagari_to_latin);
|
tcase_add_test(test_case, test_translit_words);
|
||||||
tcase_add_test(test_case, test_translit_latin_to_devanagari);
|
tcase_add_test(test_case, test_translit_vedic);
|
||||||
tcase_add_test(test_case, test_translit_lla_sylable);
|
tcase_add_test(test_case, test_translit_lla_sylable);
|
||||||
tcase_add_test(test_case, test_translit_candrabindu);
|
tcase_add_test(test_case, test_translit_candrabindu);
|
||||||
tcase_add_test(test_case, test_translit_zero_width_joiner);
|
tcase_add_test(test_case, test_translit_zero_width_joiner);
|
||||||
tcase_add_test(test_case, test_translit_vedic);
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,6 +31,12 @@ static struct translit_letter table[] = {
|
||||||
{0x090f, VOWEL, "e"}, /* 13 */
|
{0x090f, VOWEL, "e"}, /* 13 */
|
||||||
{0x0913, VOWEL, "o"}, /* 14 */
|
{0x0913, VOWEL, "o"}, /* 14 */
|
||||||
|
|
||||||
|
/* Codas */
|
||||||
|
{0x0902, CODA, "\u1e43"}, /* anusvara (.m) */
|
||||||
|
{0x0903, CODA, "\u1e25"}, /* visarga (.h) */
|
||||||
|
{0x093d, CODA, "'"}, /* avagraha (') */
|
||||||
|
{0x0901, CODA, "m\u0310"}, /* candrabindu */
|
||||||
|
|
||||||
/* Consonants */
|
/* Consonants */
|
||||||
{0x0916, CONSONANT, "kh"}, /* 01 */
|
{0x0916, CONSONANT, "kh"}, /* 01 */
|
||||||
{0x0918, CONSONANT, "gh"}, /* 02 */
|
{0x0918, CONSONANT, "gh"}, /* 02 */
|
||||||
|
@ -67,12 +73,6 @@ static struct translit_letter table[] = {
|
||||||
{0x0935, CONSONANT, "v"}, /* 33 */
|
{0x0935, CONSONANT, "v"}, /* 33 */
|
||||||
{0x0933, CONSONANT, "\u1e37"}, /* (.l) */
|
{0x0933, CONSONANT, "\u1e37"}, /* (.l) */
|
||||||
|
|
||||||
/* Codas */
|
|
||||||
{0x0902, CODA, "\u1e43"}, /* anusvara (.m) */
|
|
||||||
{0x0903, CODA, "\u1e25"}, /* visarga (.h) */
|
|
||||||
{0x093d, CODA, "'"}, /* avagraha (') */
|
|
||||||
{0x0901, CODA, "m\u0310"}, /* candrabindu */
|
|
||||||
|
|
||||||
/* Numbers */
|
/* Numbers */
|
||||||
{0x0966, NUMBER, "0"},
|
{0x0966, NUMBER, "0"},
|
||||||
{0x0967, NUMBER, "1"},
|
{0x0967, NUMBER, "1"},
|
||||||
|
@ -118,6 +118,12 @@ static struct translit_letter *letter_by_code(unsigned int c)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int is_vowel_sign(unsigned int c)
|
||||||
|
{
|
||||||
|
struct translit_letter *letter = letter_by_code(c);
|
||||||
|
return letter && letter->type == VOWEL_SIGN;
|
||||||
|
}
|
||||||
|
|
||||||
int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
|
int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
|
||||||
{
|
{
|
||||||
struct translit_letter *letter, *prev = NULL;
|
struct translit_letter *letter, *prev = NULL;
|
||||||
|
@ -159,12 +165,10 @@ int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (c == ZERO_WIDTH_JOINER) {
|
if (c == ZERO_WIDTH_JOINER && done) {
|
||||||
/*
|
/* The ZWJ can substitute a virama so we need
|
||||||
* the zero width joiner joins consonants
|
* to remove the inherent schwa. */
|
||||||
* so the inherent schwa has to be removed.
|
if (is_vowel_sign(utf8_unpack_char(src))) {
|
||||||
*/
|
|
||||||
if (done && *(latin + done - 1) == 'a') {
|
|
||||||
done--;
|
done--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -213,10 +217,15 @@ static struct translit_letter *vowel_sign_by_data(const char *data)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define PACK_LETTER(d, c) { \
|
||||||
|
utf8_pack_char(d, c); \
|
||||||
|
done += utf8_char_length(c); \
|
||||||
|
}
|
||||||
|
|
||||||
int transliterate_latin_to_devanagari(const char *latin, char **ret)
|
int transliterate_latin_to_devanagari(const char *latin, char **ret)
|
||||||
{
|
{
|
||||||
struct translit_letter *letter = NULL, *next;
|
struct translit_letter *letter = NULL, *next, *last = NULL;
|
||||||
unsigned int alloc = 0, done = 0, c, len;
|
unsigned int alloc = 0, done = 0, c = 0;
|
||||||
const char *src = latin;
|
const char *src = latin;
|
||||||
char *devanagari = NULL;
|
char *devanagari = NULL;
|
||||||
|
|
||||||
|
@ -226,38 +235,13 @@ int transliterate_latin_to_devanagari(const char *latin, char **ret)
|
||||||
alloc += CHUNKSIZE;
|
alloc += CHUNKSIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* consonant (.l) */
|
|
||||||
if (strncmp(src, "\u1e37", 3) == 0) {
|
|
||||||
letter = letter_by_data(src + 3);
|
|
||||||
|
|
||||||
if (letter) {
|
|
||||||
utf8_pack_char(devanagari + done, 0x0933);
|
|
||||||
done += 3;
|
|
||||||
src += 3;
|
|
||||||
if (letter->type == VOWEL) {
|
|
||||||
goto encode_vowel_modifier;
|
|
||||||
} else {
|
|
||||||
utf8_pack_char(devanagari + done, VIRAMA);
|
|
||||||
done += 3;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* candrabindu */
|
|
||||||
if (strncmp(src, "m\u0310", 3) == 0) {
|
|
||||||
utf8_pack_char(devanagari + done, 0x0901);
|
|
||||||
done += 3;
|
|
||||||
src += 3;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* zero-width non-joiner */
|
/* zero-width non-joiner */
|
||||||
if (strncmp(src, "\u200c", 3) == 0) {
|
if (strncmp(src, "\u200c", 3) == 0) {
|
||||||
if (letter && (letter->code == VIRAMA || letter->type == CONSONANT)) {
|
if (last) {
|
||||||
|
/* not at the beginning of a word */
|
||||||
next = letter_by_data(src + 3);
|
next = letter_by_data(src + 3);
|
||||||
if (next->type == CONSONANT) {
|
if (next && next->type == CONSONANT) {
|
||||||
utf8_pack_char(devanagari + done, 0x200c);
|
PACK_LETTER(devanagari + done, 0x200c);
|
||||||
done += 3;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
src += 3;
|
src += 3;
|
||||||
|
@ -266,43 +250,62 @@ int transliterate_latin_to_devanagari(const char *latin, char **ret)
|
||||||
|
|
||||||
letter = letter_by_data(src);
|
letter = letter_by_data(src);
|
||||||
if (letter) {
|
if (letter) {
|
||||||
|
if (letter->code == 0x090c) { /* independent ‘.l’ vowel */
|
||||||
|
next = letter_by_data(src + 3);
|
||||||
|
if (last || (next && next->type == VOWEL && next->code != 0x090c)) {
|
||||||
|
letter = letter_by_code(0x0933); /* .la */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
encode_consonant:
|
||||||
|
/* A consonant or an initial vowel */
|
||||||
utf8_pack_char(devanagari + done, letter->code);
|
utf8_pack_char(devanagari + done, letter->code);
|
||||||
len = utf8_char_length(letter->code);
|
PACK_LETTER(devanagari + done, letter->code);
|
||||||
done += len;
|
|
||||||
src += strlen(letter->data);
|
src += strlen(letter->data);
|
||||||
|
|
||||||
/* zero width joiner */
|
|
||||||
c = utf8_unpack_char(src);
|
c = utf8_unpack_char(src);
|
||||||
if (c == ZERO_WIDTH_JOINER) {
|
if (c == ZERO_WIDTH_JOINER)
|
||||||
utf8_pack_char(devanagari + done, ZERO_WIDTH_JOINER);
|
|
||||||
done += 3;
|
|
||||||
src += 3;
|
src += 3;
|
||||||
}
|
|
||||||
|
last = letter;
|
||||||
|
|
||||||
if (letter->type == VOWEL || letter->type == CODA)
|
if (letter->type == VOWEL || letter->type == CODA)
|
||||||
continue;
|
continue;
|
||||||
encode_vowel_modifier:
|
|
||||||
next = vowel_sign_by_data(src);
|
/* A vowel modifier (if any) */
|
||||||
if (next) {
|
letter = vowel_sign_by_data(src);
|
||||||
utf8_pack_char(devanagari + done, next->code);
|
if (letter) {
|
||||||
done += utf8_char_length(next->code);
|
if (letter->code == 0x0962) {
|
||||||
src += strlen(next->data);
|
next = letter_by_data(src + 3);
|
||||||
} else {
|
if (next && next->type == VOWEL) {
|
||||||
if (*src == SCHWA_CHARACTER) {
|
/* consonant ‘.la’ */
|
||||||
src++;
|
letter = letter_by_code(0x0933);
|
||||||
} else {
|
if (last && last->type == CONSONANT)
|
||||||
if (letter->type == CONSONANT) {
|
PACK_LETTER(devanagari + done, VIRAMA);
|
||||||
utf8_pack_char(devanagari + done, VIRAMA);
|
|
||||||
done += utf8_char_length(VIRAMA);
|
goto encode_consonant;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (c == ZERO_WIDTH_JOINER)
|
||||||
|
PACK_LETTER(devanagari + done, c);
|
||||||
|
|
||||||
|
PACK_LETTER(devanagari + done, letter->code);
|
||||||
|
src += strlen(letter->data);
|
||||||
|
|
||||||
|
} else if (*src == SCHWA_CHARACTER) {
|
||||||
|
src++;
|
||||||
|
|
||||||
|
} else {
|
||||||
|
if (last->type == CONSONANT) {
|
||||||
|
PACK_LETTER(devanagari + done, VIRAMA);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
c = utf8_unpack_char(src);
|
c = utf8_unpack_char(src);
|
||||||
len = utf8_char_length(c);
|
PACK_LETTER(devanagari + done, c);
|
||||||
utf8_pack_char(devanagari + done, c);
|
src += utf8_char_length(c);
|
||||||
done += len;
|
last = NULL;
|
||||||
src += len;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue