/*
 * sanskrit-iast/transcription.c
 * (137 lines, 2.8 KiB, C)
 */

/* SPDX-License-Identifier: GPL-2.0 */
/* 2020-01-02 16:32:03 +01:00 */
/* https://cs.wikipedia.org/wiki/Wikipedie:Transkripce_hind%C5%A1tiny */
#include "compat.h"
#include "transcription.h"
#include "iast-czech.h"
#include "utf8.h"
/* Number of bytes by which the output buffer is enlarged on each realloc(). */
#define CHUNKSIZE 1024
/*
 * Inherent vowel written after the romanization of every CONSONANT letter;
 * a following VOWEL_SIGN steps back one byte and overwrites it.
 */
#define SCHWA_CHARACTER 'a'
/*
 * Report whether code point c is a Devanagari consonant, i.e. lies in the
 * inclusive range U+0915 (ka) .. U+0939 (ha).
 *
 * Returns 1 for a consonant, 0 otherwise.
 */
static inline int is_consonant(unsigned int c)
{
	const unsigned int first = 0x0915;
	const unsigned int last = 0x0939;

	if (c < first)
		return 0;

	return c <= last;
}
/*
 * Report whether code point c is one of the Devanagari nasals handled by
 * the transcription: the five nasal consonants or the anusvara sign.
 *
 * Returns 1 for a nasal, 0 otherwise.
 */
static inline int is_nasal(unsigned int c)
{
	switch (c) {
	case 0x0902: /* sign anusvara */
	case 0x0919: /* nga */
	case 0x091e: /* nya */
	case 0x0923: /* nna */
	case 0x0928: /* na */
	case 0x092e: /* ma */
		return 1;
	default:
		return 0;
	}
}
/*
 * Adapt the romanization of a nasal to the letter that follows it.
 *
 * latin: output buffer produced so far
 * pos:   number of bytes already written to latin (only read here)
 * prev:  code point processed in the previous step
 * c:     code point that is about to be transcribed
 *
 * If prev is a nasal (see is_nasal()), the last byte of the output is
 * replaced by 'm' when c is a labial (p, ph, b, bh, m) and by 'n' when c is
 * any other consonant. Nothing is changed when the last byte is
 * SCHWA_CHARACTER: in that case the nasal consonant still carries its
 * inherent vowel and is not directly adjacent to c, so overwriting the byte
 * would destroy the vowel instead of adjusting the nasal.
 */
static void nasal_consonants_filter(char *latin, unsigned int *pos,
				    unsigned int prev, unsigned int c)
{
	char *tail;

	/* Nothing to rewrite; also avoids forming a pointer before latin. */
	if (!is_nasal(prev) || *pos == 0)
		return;

	tail = latin + *pos - 1;
	if (*tail == SCHWA_CHARACTER)
		return;

	switch (c) {
	/* rewrite nasals before labials to 'm' */
	case 0x092b: /* ph */
	case 0x092d: /* bh */
	case 0x092a: /* p */
	case 0x092c: /* b */
	case 0x092e: /* m */
		*tail = 'm';
		break;
	default:
		/* any other consonant turns the nasal into 'n' */
		if (is_consonant(c))
			*tail = 'n';
		break;
	}
}
/*
 * Apply word-final corrections once the first character after a Devanagari
 * word has been written to the output.
 *
 * latin: output buffer; its last bytes are the encoding of c
 * pos:   in/out number of bytes written to latin; decreased by one for
 *        every byte removed here
 * prev:  code point processed in the previous step
 * c:     code point that has just been appended to latin
 *
 * The filter acts only when prev is Devanagari and c is not, i.e. on the
 * boundary right after a word:
 *   - a final long 'á' (prev is the vowel sign U+093E) is shortened to 'a';
 *   - a final 'm' followed by virama (prev is U+094D) is removed.
 * In both cases the bytes of c are shifted one position to the left so that
 * the output stays contiguous.
 *
 * NOTE(review): assumes c occupies utf8_char_length(c) bytes in latin and
 * that the vowel sign U+093E was written as the two-byte UTF-8 sequence for
 * 'á' - confirm against the transliteration table.
 */
static void end_of_word_filter(char *latin, unsigned int *pos,
			       unsigned int prev, unsigned int c)
{
	unsigned int len;
	char *cur;

	if (!is_devanagari(prev) || is_devanagari(c))
		return;

	len = utf8_char_length(c);
	if (*pos < len)
		return;

	/* first byte of c inside the output buffer */
	cur = latin + *pos - len;

	if (prev == 0x093e) { /* modifier 'á' */
		/* shorten ending 'á' to 'a' */
		if (*pos >= len + 2) {
			cur[-2] = 'a';
			/* drop the second byte of 'á' by pulling c over it */
			memmove(cur - 1, cur, len);
			*pos = *pos - 1;
		}
	} else if (prev == 0x094d) {
		/* remove singular nominative suffix */
		if (*pos >= len + 1 && cur[-1] == 'm') {
			/* the byte count is the encoded length of c, not its value */
			memmove(cur - 1, cur, len);
			*pos = *pos - 1;
		}
	}
}
/*
 * Look up code point c in a transliteration table.
 *
 * table: array of entries terminated by an entry whose code is 0
 * c:     code point to search for
 *
 * Returns a pointer to the first entry whose code equals c, or NULL when
 * the terminating entry is reached without a match.
 */
static struct translit_letter *letter_by_code(struct translit_letter *table,
					      unsigned int c)
{
	struct translit_letter *entry;

	for (entry = table; entry->code != 0; entry++) {
		if (entry->code == c)
			return entry;
	}

	return NULL;
}
/*
 * Make sure the output buffer can hold at least need bytes, enlarging it in
 * CHUNKSIZE steps.
 *
 * Returns 0 on success. On allocation failure returns -1 and leaves *buf
 * and *alloc untouched, so the caller still owns the old buffer.
 */
static int reserve_latin(char **buf, unsigned int *alloc, unsigned int need)
{
	unsigned int size = *alloc;
	char *grown;

	if (need <= size)
		return 0;

	while (size < need)
		size += CHUNKSIZE;

	grown = realloc(*buf, size);
	if (!grown)
		return -1;

	*buf = grown;
	*alloc = size;
	return 0;
}

/*
 * Transcribe a NUL-terminated UTF-8 Devanagari string into Czech
 * transcription.
 *
 * The input is decoded one code point at a time. Code points found in the
 * IAST-Czech transliteration table are replaced by the table data:
 * consonants get SCHWA_CHARACTER appended, vowel signs overwrite the
 * preceding byte (the schwa of the consonant they belong to). Code points
 * not present in the table are copied through unchanged.
 * nasal_consonants_filter() and end_of_word_filter() post-process the
 * output around every character.
 *
 * Returns a newly allocated NUL-terminated string that the caller must
 * release with free(), or NULL if devanagari is NULL or memory allocation
 * fails.
 */
char *transcript_devanagari_to_czech(const char *devanagari)
{
	struct translit_letter *table, *letter;
	unsigned int c, prev = 0, alloc = 0, done = 0, len, data_len;
	const char *src = devanagari;
	char *latin = NULL;

	if (!devanagari)
		return NULL;

	table = get_iast_czech_transliteration_table();

	while (1) {
		/* room for one packed code point, including the final NUL */
		if (reserve_latin(&latin, &alloc, done + UNICODE_MAX_LENGTH) < 0)
			goto out_of_memory;

		c = utf8_unpack_char(src);
		len = utf8_char_length(c);
		src += len;

		nasal_consonants_filter(latin, &done, prev, c);

		letter = letter_by_code(table, c);
		if (letter) {
			data_len = strlen(letter->data);

			/*
			 * strcpy() stores data_len + 1 bytes; for consonants
			 * the schwa takes the place of the terminator.
			 */
			if (reserve_latin(&latin, &alloc, done + data_len + 1) < 0)
				goto out_of_memory;

			switch (letter->type) {
			case CONSONANT:
				strcpy(latin + done, letter->data);
				done += data_len;
				*(latin + done++) = SCHWA_CHARACTER;
				break;
			case VOWEL_SIGN:
				/* replace the schwa of the preceding consonant */
				if (done)
					done--;
				strcpy(latin + done, letter->data);
				done += data_len;
				break;
			default:
				strcpy(latin + done, letter->data);
				done += data_len;
				break;
			}
		} else {
			utf8_pack_char(latin + done, c);
			done += len;
		}

		end_of_word_filter(latin, &done, prev, c);

		if (c == 0)
			break;

		prev = c;
	}

	/*
	 * The terminating code point was counted in done; make sure the last
	 * counted byte really is the terminator. The done == 0 case only
	 * guards against writing in front of the buffer.
	 */
	*(latin + (done ? done - 1 : 0)) = '\0';

	return latin;

out_of_memory:
	free(latin);
	return NULL;
}