implement reverse transliteration (latin -> devanagari)

This commit is contained in:
Vlasta Vesely 2020-01-01 22:02:44 +01:00
parent f4d545f4d1
commit 412be221a7
4 changed files with 108 additions and 6 deletions

6
iast.c
View file

@ -6,6 +6,9 @@
static struct translit_letter table[] = {
/* Special characters */
{0x0950, SPECIAL, "aum"}, /* aum */
/* Vowels */
{0x0910, VOWEL, "ai"}, /* 01 */
{0x0914, VOWEL, "au"}, /* 02 */
@ -62,9 +65,6 @@ static struct translit_letter table[] = {
{0x0903, CODA, "\u1e25"}, /* visarga (.h) */
{0x093d, CODA, "'"}, /* avagrada (') */
/* Special characters */
{0x0950, SPECIAL, "aum"}, /* aum */
/* Numbers */
{0x0966, NUMBER, "0"},
{0x0967, NUMBER, "1"},

View file

@ -1,7 +1,7 @@
#include "test.h"
#include "translit.h"
#include "../transliteration.h"
START_TEST(test_translit_devanagari_to_latin)
{
char *latin;
@ -28,7 +28,30 @@ START_TEST(test_translit_devanagari_to_latin)
}
END_TEST
/*
 * Check latin -> Devanagari transliteration against a handful of known words.
 * Each result is heap-allocated by the function under test and released here.
 */
START_TEST(test_translit_latin_to_devanagari)
{
	/* IAST input paired with the Devanagari text it has to produce. */
	static const struct {
		const char *latin;
		const char *expected;
	} cases[] = {
		{"saṃskṛtam", "संस्कृतम्"},
		{"bhagavadgītā", "भगवद्गीता"},
		{"āryāvarta", "आर्यावर्त"},
		{"mahābhāratam", "महाभारतम्"},
	};
	size_t i;

	for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
		char *devanagari;

		devanagari = transliterate_latin_to_devanagari(cases[i].latin);
		ck_assert_str_eq(cases[i].expected, devanagari);
		free(devanagari);
	}
}
END_TEST
/*
 * Add the transliteration tests for both directions to the given test case.
 * The Devanagari -> latin test is registered before the latin -> Devanagari
 * test.
 */
void register_translit_tests(TCase *test_case)
{
tcase_add_test(test_case, test_translit_devanagari_to_latin);
tcase_add_test(test_case, test_translit_latin_to_devanagari);
}

View file

@ -6,6 +6,7 @@
#include "utf8.h"
/*
 * Latin letter that transliterate_latin_to_devanagari() consumes without
 * emitting anything when it follows a letter and no vowel sign matches.
 */
#define SCHWA_CHARACTER 'a'
/*
 * Code point transliterate_latin_to_devanagari() appends to a CONSONANT that
 * is followed by neither a vowel sign nor SCHWA_CHARACTER.
 */
#define VIRAMA 0x094d
/* Number of bytes by which the output buffer grows on each realloc(). */
#define CHUNKSIZE 1024
static struct translit_letter *letter_by_code(struct translit_letter *table,
@ -22,8 +23,7 @@ static struct translit_letter *letter_by_code(struct translit_letter *table,
char *transliterate_devanagari_to_latin(const char *devanagari)
{
struct translit_letter *table;
struct translit_letter *letter;
struct translit_letter *table, *letter;
unsigned int c, alloc = 0, done = 0, len;
const char *src = devanagari;
char *latin = NULL;
@ -70,3 +70,81 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
return latin;
}
/*
 * Find the first table entry whose latin spelling is a prefix of `data`.
 *
 * The table is scanned in order up to the entry with code == 0, so when
 * several spellings match, the one listed earliest wins. Entries with an
 * empty spelling never match. Returns NULL when nothing matches.
 */
static struct translit_letter *letter_by_data(struct translit_letter *table,
					      const char *data)
{
	struct translit_letter *entry;

	for (entry = table; entry->code != 0; entry++) {
		unsigned int prefix_len = strlen(entry->data);

		if (prefix_len == 0)
			continue;
		if (strncmp(entry->data, data, prefix_len) != 0)
			continue;

		return entry;
	}

	return NULL;
}
/*
 * Find the first VOWEL_SIGN entry whose latin spelling is a prefix of `data`.
 *
 * The table is scanned in order up to the entry with code == 0. Entries of
 * any other type, and entries with an empty spelling, are passed over.
 * Returns NULL when no vowel sign matches.
 */
static struct translit_letter *vowel_sign_by_data(struct translit_letter *table,
						  const char *data)
{
	struct translit_letter *entry;

	for (entry = table; entry->code != 0; entry++) {
		unsigned int prefix_len = strlen(entry->data);

		if (prefix_len == 0)
			continue;
		if (strncmp(entry->data, data, prefix_len) != 0)
			continue;
		if (entry->type != VOWEL_SIGN)
			continue;

		return entry;
	}

	return NULL;
}
/*
 * Transliterate IAST-romanised text into Devanagari.
 *
 * The input is consumed left to right. At each position the IAST table is
 * searched for an entry whose latin spelling is a prefix of the remaining
 * input; a byte that matches no entry is copied to the output unchanged.
 * After a letter that is neither a VOWEL nor a CODA:
 *   - a matching vowel sign is emitted if one follows,
 *   - otherwise a following SCHWA_CHARACTER is consumed without output,
 *   - otherwise a CONSONANT gets a VIRAMA appended.
 *
 * Returns a NUL-terminated buffer obtained from realloc() that the caller
 * must free(). Returns NULL when `latin` is NULL or empty, or when memory
 * cannot be allocated.
 */
char *transliterate_latin_to_devanagari(const char *latin)
{
	/*
	 * A single pass of the loop writes at most two code points (the letter
	 * plus a vowel sign or virama). One more byte is held back so the
	 * terminating NUL always fits, even if the input ends on this pass.
	 * CHUNKSIZE is assumed to be at least this large.
	 */
	const size_t headroom = 2 * UNICODE_MAX_LENGTH + 1;
	struct translit_letter *table, *letter, *next;
	size_t alloc = 0, done = 0;
	const char *src = latin;
	char *devanagari = NULL;

	if (!latin)
		return NULL;

	table = get_iast_transliteration_table();

	while (*src) {
		if (alloc < done + headroom) {
			/* Keep the old pointer until realloc() has succeeded. */
			char *grown = realloc(devanagari, alloc + CHUNKSIZE);

			if (!grown) {
				free(devanagari);
				return NULL;
			}
			devanagari = grown;
			alloc += CHUNKSIZE;
		}

		letter = letter_by_data(table, src);
		if (!letter) {
			/* Not in the table: pass the byte through untouched. */
			devanagari[done++] = *src++;
			continue;
		}

		utf8_pack_char(devanagari + done, letter->code);
		done += utf8_char_length(letter->code);
		src += strlen(letter->data);

		/* Vowels and codas never take a dependent vowel sign. */
		if (letter->type == VOWEL || letter->type == CODA)
			continue;

		next = vowel_sign_by_data(table, src);
		if (next) {
			utf8_pack_char(devanagari + done, next->code);
			done += utf8_char_length(next->code);
			src += strlen(next->data);
		} else if (*src == SCHWA_CHARACTER) {
			/* The inherent vowel is implied by the bare letter. */
			src++;
		} else if (letter->type == CONSONANT) {
			/* No vowel at all: suppress the inherent one. */
			utf8_pack_char(devanagari + done, VIRAMA);
			done += utf8_char_length(VIRAMA);
		}
	}

	if (devanagari)
		devanagari[done] = '\0';

	return devanagari;
}

View file

@ -23,6 +23,7 @@ struct translit_context {
};
/*
 * Transliterate Devanagari text into latin script.
 * NOTE(review): presumably returns a heap buffer the caller must free(), like
 * the reverse direction below - confirm against the definition.
 */
char *transliterate_devanagari_to_latin(const char *text);
/*
 * Transliterate latin (IAST) text into Devanagari.
 * The result is a NUL-terminated heap buffer that the caller releases with
 * free(); it may be NULL, e.g. when `text` is empty.
 */
char *transliterate_latin_to_devanagari(const char *text);
static inline int is_devanagari(unsigned int code)
{