implement reverse transliteration (latin -> devanagari)
This commit is contained in:
parent
f4d545f4d1
commit
412be221a7
4 changed files with 108 additions and 6 deletions
6
iast.c
6
iast.c
|
@ -6,6 +6,9 @@
|
||||||
|
|
||||||
static struct translit_letter table[] = {
|
static struct translit_letter table[] = {
|
||||||
|
|
||||||
|
/* Special characters */
|
||||||
|
{0x0950, SPECIAL, "aum"}, /* aum */
|
||||||
|
|
||||||
/* Vowels */
|
/* Vowels */
|
||||||
{0x0910, VOWEL, "ai"}, /* 01 */
|
{0x0910, VOWEL, "ai"}, /* 01 */
|
||||||
{0x0914, VOWEL, "au"}, /* 02 */
|
{0x0914, VOWEL, "au"}, /* 02 */
|
||||||
|
@ -62,9 +65,6 @@ static struct translit_letter table[] = {
|
||||||
{0x0903, CODA, "\u1e25"}, /* visarga (.h) */
|
{0x0903, CODA, "\u1e25"}, /* visarga (.h) */
|
||||||
{0x093d, CODA, "'"}, /* avagrada (') */
|
{0x093d, CODA, "'"}, /* avagrada (') */
|
||||||
|
|
||||||
/* Special characters */
|
|
||||||
{0x0950, SPECIAL, "aum"}, /* aum */
|
|
||||||
|
|
||||||
/* Numbers */
|
/* Numbers */
|
||||||
{0x0966, NUMBER, "0"},
|
{0x0966, NUMBER, "0"},
|
||||||
{0x0967, NUMBER, "1"},
|
{0x0967, NUMBER, "1"},
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
|
#include "test.h"
|
||||||
#include "translit.h"
|
#include "translit.h"
|
||||||
#include "../transliteration.h"
|
#include "../transliteration.h"
|
||||||
|
|
||||||
|
|
||||||
START_TEST(test_translit_devanagari_to_latin)
|
START_TEST(test_translit_devanagari_to_latin)
|
||||||
{
|
{
|
||||||
char *latin;
|
char *latin;
|
||||||
|
@ -28,7 +28,30 @@ START_TEST(test_translit_devanagari_to_latin)
|
||||||
}
|
}
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
|
START_TEST(test_translit_latin_to_devanagari)
|
||||||
|
{
|
||||||
|
char *devanagari;
|
||||||
|
|
||||||
|
devanagari = transliterate_latin_to_devanagari("saṃskṛtam");
|
||||||
|
ck_assert_str_eq("संस्कृतम्", devanagari);
|
||||||
|
free(devanagari);
|
||||||
|
|
||||||
|
devanagari = transliterate_latin_to_devanagari("bhagavadgītā");
|
||||||
|
ck_assert_str_eq("भगवद्गीता", devanagari);
|
||||||
|
free(devanagari);
|
||||||
|
|
||||||
|
devanagari = transliterate_latin_to_devanagari("āryāvarta");
|
||||||
|
ck_assert_str_eq("आर्यावर्त", devanagari);
|
||||||
|
free(devanagari);
|
||||||
|
|
||||||
|
devanagari = transliterate_latin_to_devanagari("mahābhāratam");
|
||||||
|
ck_assert_str_eq("महाभारतम्", devanagari);
|
||||||
|
free(devanagari);
|
||||||
|
}
|
||||||
|
END_TEST
|
||||||
|
|
||||||
void register_translit_tests(TCase *test_case)
|
void register_translit_tests(TCase *test_case)
|
||||||
{
|
{
|
||||||
tcase_add_test(test_case, test_translit_devanagari_to_latin);
|
tcase_add_test(test_case, test_translit_devanagari_to_latin);
|
||||||
|
tcase_add_test(test_case, test_translit_latin_to_devanagari);
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,6 +6,7 @@
|
||||||
#include "utf8.h"
|
#include "utf8.h"
|
||||||
|
|
||||||
#define SCHWA_CHARACTER 'a'
|
#define SCHWA_CHARACTER 'a'
|
||||||
|
#define VIRAMA 0x094d
|
||||||
#define CHUNKSIZE 1024
|
#define CHUNKSIZE 1024
|
||||||
|
|
||||||
static struct translit_letter *letter_by_code(struct translit_letter *table,
|
static struct translit_letter *letter_by_code(struct translit_letter *table,
|
||||||
|
@ -22,8 +23,7 @@ static struct translit_letter *letter_by_code(struct translit_letter *table,
|
||||||
|
|
||||||
char *transliterate_devanagari_to_latin(const char *devanagari)
|
char *transliterate_devanagari_to_latin(const char *devanagari)
|
||||||
{
|
{
|
||||||
struct translit_letter *table;
|
struct translit_letter *table, *letter;
|
||||||
struct translit_letter *letter;
|
|
||||||
unsigned int c, alloc = 0, done = 0, len;
|
unsigned int c, alloc = 0, done = 0, len;
|
||||||
const char *src = devanagari;
|
const char *src = devanagari;
|
||||||
char *latin = NULL;
|
char *latin = NULL;
|
||||||
|
@ -70,3 +70,81 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
|
||||||
|
|
||||||
return latin;
|
return latin;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static struct translit_letter *letter_by_data(struct translit_letter *table,
|
||||||
|
const char *data)
|
||||||
|
{
|
||||||
|
while (table->code != 0) {
|
||||||
|
unsigned int len = strlen(table->data);
|
||||||
|
if (len && strncmp(table->data, data, len) == 0)
|
||||||
|
return table;
|
||||||
|
table++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct translit_letter *vowel_sign_by_data(struct translit_letter *table,
|
||||||
|
const char *data)
|
||||||
|
{
|
||||||
|
while (table->code != 0) {
|
||||||
|
unsigned int len = strlen(table->data);
|
||||||
|
if (len && strncmp(table->data, data, len) == 0 &&
|
||||||
|
table->type == VOWEL_SIGN)
|
||||||
|
return table;
|
||||||
|
table++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *transliterate_latin_to_devanagari(const char *latin)
|
||||||
|
{
|
||||||
|
struct translit_letter *table, *letter, *next;
|
||||||
|
unsigned int alloc = 0, done = 0, len;
|
||||||
|
const char *src = latin;
|
||||||
|
char *devanagari = NULL;
|
||||||
|
|
||||||
|
table = get_iast_transliteration_table();
|
||||||
|
|
||||||
|
while (*src) {
|
||||||
|
if (alloc < done + UNICODE_MAX_LENGTH) {
|
||||||
|
devanagari = realloc(devanagari, alloc + CHUNKSIZE);
|
||||||
|
alloc += CHUNKSIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
letter = letter_by_data(table, src);
|
||||||
|
if (letter) {
|
||||||
|
utf8_pack_char(devanagari + done, letter->code);
|
||||||
|
len = utf8_char_length(letter->code);
|
||||||
|
done += len;
|
||||||
|
src += strlen(letter->data);
|
||||||
|
|
||||||
|
if (letter->type == VOWEL || letter->type == CODA)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
next = vowel_sign_by_data(table, src);
|
||||||
|
if (next) {
|
||||||
|
utf8_pack_char(devanagari + done, next->code);
|
||||||
|
done += utf8_char_length(next->code);
|
||||||
|
src += strlen(next->data);
|
||||||
|
} else {
|
||||||
|
if (*src == SCHWA_CHARACTER) {
|
||||||
|
src++;
|
||||||
|
} else {
|
||||||
|
if (letter->type == CONSONANT) {
|
||||||
|
utf8_pack_char(devanagari + done, VIRAMA);
|
||||||
|
done += utf8_char_length(VIRAMA);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
devanagari[done++] = *src++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (devanagari)
|
||||||
|
devanagari[done] = '\0';
|
||||||
|
|
||||||
|
return devanagari;
|
||||||
|
}
|
||||||
|
|
|
@ -23,6 +23,7 @@ struct translit_context {
|
||||||
};
|
};
|
||||||
|
|
||||||
char *transliterate_devanagari_to_latin(const char *text);
|
char *transliterate_devanagari_to_latin(const char *text);
|
||||||
|
char *transliterate_latin_to_devanagari(const char *text);
|
||||||
|
|
||||||
static inline int is_devanagari(unsigned int code)
|
static inline int is_devanagari(unsigned int code)
|
||||||
{
|
{
|
||||||
|
|
Loading…
Add table
Reference in a new issue