/* SPDX-License-Identifier: GPL-2.0 */
|
|
/* https://cs.wikipedia.org/wiki/Wikipedie:Transkripce_hind%C5%A1tiny */
|
|
|
|
#include "compat.h"
|
|
#include "transcription.h"
|
|
#include "iast-czech.h"
|
|
#include "utf8.h"
|
|
|
|
/* Step, in bytes, by which the transcription output buffer is enlarged. */
#define CHUNKSIZE 1024
|
|
/* Inherent vowel written after every transcribed consonant. */
#define SCHWA_CHARACTER 'a'
|
|
|
|
/*
 * Tell whether code point @c is a Devanagari consonant letter, i.e. lies in
 * the contiguous range U+0915 (KA) .. U+0939 (HA), both ends included.
 *
 * Returns 1 for a consonant and 0 for anything else.
 */
static inline int is_consonant(unsigned int c)
{
	if (c < 0x0915)
		return 0;

	return c <= 0x0939;
}
|
|
|
|
/*
 * Tell whether code point @c is one of the Devanagari nasals handled by the
 * transcription: the five nasal consonants or the anusvara sign.
 *
 * Returns 1 for a nasal and 0 for anything else.
 */
static inline int is_nasal(unsigned int c)
{
	switch (c) {
	case 0x0902: /* anusvara */
	case 0x0919: /* nga */
	case 0x091e: /* nya */
	case 0x0923: /* nna */
	case 0x0928: /* na */
	case 0x092e: /* ma */
		return 1;
	default:
		return 0;
	}
}
|
|
|
|
static void nasal_consonants_filter(char *latin, unsigned int *pos,
|
|
unsigned int prev, unsigned int c)
|
|
{
|
|
char *tail = latin + *pos - 1;
|
|
|
|
if (is_nasal(prev)) {
|
|
/* rewrite nasals before labials to 'm' */
|
|
switch (c) {
|
|
case 0x092b: /* ph */
|
|
case 0x092d: /* bh */
|
|
case 0x092a: /* p */
|
|
case 0x092c: /* b */
|
|
case 0x092e: /* m */
|
|
*tail = 'm';
|
|
break;
|
|
default:
|
|
if (is_consonant(c) && *tail != SCHWA_CHARACTER)
|
|
*tail = 'n';
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * Fix up the end of a Devanagari word once the first character after it has
 * been written.
 *
 * @latin: output buffer, already containing the bytes written for @c
 * @pos:   number of bytes written so far; decremented by one when a byte is
 *         removed from the output
 * @prev:  previously processed code point (last character of the word)
 * @c:     current code point (first character after the word)
 *
 * Two adjustments are made, both only when @prev is Devanagari and @c is not:
 *  - a word-final long 'á' (@prev is U+093E) is shortened to 'a';
 *  - a word-final 'm' followed by virama (@prev is U+094D) is removed.
 *
 * In both cases the bytes of @c are shifted down by one so that they stay
 * directly behind the corrected word.
 *
 * NOTE(review): assumes @c occupies utf8_char_length(c) bytes in @latin, and
 * that U+093E is rendered as the two-byte UTF-8 sequence for 'á' - confirm
 * against the transliteration table.
 */
static void end_of_word_filter(char *latin, unsigned int *pos,
			       unsigned int prev, unsigned int c)
{
	unsigned int len;
	char *cur;

	if (!is_devanagari(prev) || is_devanagari(c))
		return;

	len = utf8_char_length(c);
	if (*pos < len)
		return;

	/* first byte written for c; the word ends right in front of it */
	cur = latin + *pos - len;

	if (prev == 0x093e) { /* modifier 'á' */
		/* shorten ending 'á' to 'a': two bytes collapse into one */
		if (*pos < len + 2)
			return;
		cur[-2] = 'a';
		memmove(cur - 1, cur, len);
		*pos = *pos - 1;
	} else if (prev == 0x094d) { /* virama */
		/* remove singular nominative suffix */
		if (*pos < len + 1 || cur[-1] != 'm')
			return;
		/* move exactly the bytes of c over the 'm' */
		memmove(cur - 1, cur, len);
		*pos = *pos - 1;
	}
}
|
|
|
|
static struct translit_letter *letter_by_code(struct translit_letter *table,
|
|
unsigned int c)
|
|
{
|
|
while (table->code != 0) {
|
|
if (table->code == c)
|
|
return table;
|
|
table++;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
char *transcript_devanagari_to_czech(const char *devanagari)
|
|
{
|
|
struct translit_letter *table, *letter;
|
|
unsigned int c, prev = 0, alloc = 0, done = 0, len;
|
|
const char *src = devanagari;
|
|
char *latin = NULL;
|
|
|
|
table = get_iast_czech_transliteration_table();
|
|
|
|
while (1) {
|
|
if (alloc < done + UNICODE_MAX_LENGTH) {
|
|
latin = realloc(latin, alloc + CHUNKSIZE);
|
|
alloc += CHUNKSIZE;
|
|
}
|
|
|
|
c = utf8_unpack_char(src);
|
|
len = utf8_char_length(c);
|
|
src += len;
|
|
|
|
nasal_consonants_filter(latin, &done, prev, c);
|
|
|
|
letter = letter_by_code(table, c);
|
|
if (letter) {
|
|
switch (letter->type) {
|
|
case CONSONANT:
|
|
strcpy(latin + done, letter->data);
|
|
done += strlen(letter->data);
|
|
*(latin + done++) = SCHWA_CHARACTER;
|
|
break;
|
|
case VOWEL_SIGN:
|
|
if (done)
|
|
done--;
|
|
strcpy(latin + done, letter->data);
|
|
done += strlen(letter->data);
|
|
break;
|
|
default:
|
|
strcpy(latin + done, letter->data);
|
|
done += strlen(letter->data);
|
|
break;
|
|
}
|
|
} else {
|
|
utf8_pack_char(latin + done, c);
|
|
done += len;
|
|
}
|
|
|
|
end_of_word_filter(latin, &done, prev, c);
|
|
|
|
|
|
if (c == 0)
|
|
break;
|
|
prev = c;
|
|
}
|
|
|
|
*(latin + done - 1) = '\0';
|
|
|
|
return latin;
|
|
}
|