sanskrit-iast/transcription.c

/* SPDX-License-Identifier: GPL-2.0 */
/* https://cs.wikipedia.org/wiki/Wikipedie:Transkripce_hind%C5%A1tiny */

#include "compat.h"
#include "transcription.h"
#include "transliteration.h"
#include "utf8.h"

#include <errno.h>
#include <stdlib.h>

/* Inherent vowel appended after every transcribed consonant */
#define SCHWA_CHARACTER   'a'
/* Code point that is skipped in the input without producing output */
#define ZERO_WIDTH_JOINER 0x200d
/* Sign tested by the end-of-word filter (has an empty transcription) */
#define VIRAMA            0x094d
/* Code point whose presence aborts the transcription with EHINDI */
#define NUKTA             0x093c
/* Growth step, in bytes, of the output buffer */
#define CHUNKSIZE         1024

/*
 * Transcription table: one entry per supported Devanagari code point,
 * giving the code point, the class of the letter and the UTF-8 text that
 * replaces it in the Czech output. Entries are looked up by code point,
 * so their order does not affect the result.
 */
static struct translit_letter table[] = {

	/* Special characters */
	{0x0950, SPECIAL, "óm"},      /* aum */

	/* Vowels */
	{0x0910, VOWEL, "ai"},        /* 01 */
	{0x0914, VOWEL, "au"},        /* 02 */
	{0x0905, VOWEL, "a"},         /* 03 */
	{0x0906, VOWEL, "á"},         /* 04 */
	{0x0907, VOWEL, "i"},         /* 05 */
	{0x0908, VOWEL, "í"},         /* 06 */
	{0x0909, VOWEL, "u"},         /* 07 */
	{0x090a, VOWEL, "ú"},         /* 08 */
	{0x090b, VOWEL, "r"},         /* 09 */
	{0x0960, VOWEL, "r"},         /* 10 */
	{0x090c, VOWEL, "l"},         /* 11 */
	{0x0961, VOWEL, "l"},         /* 12 */
	{0x090f, VOWEL, "é"},         /* 13 */
	{0x0913, VOWEL, "ó"},         /* 14 */

	/* Consonants (the transcriber appends the inherent schwa itself) */
	{0x0916, CONSONANT, "kh"},    /* 01 */
	{0x0918, CONSONANT, "gh"},    /* 02 */
	{0x091b, CONSONANT, "čh"},    /* 03 */
	{0x091d, CONSONANT, "džh"},   /* 04 */
	{0x091c, CONSONANT, "dž"},    /* 05 */
	{0x0920, CONSONANT, "th"},    /* 06 */
	{0x0922, CONSONANT, "dh"},    /* 07 */
	{0x0925, CONSONANT, "th"},    /* 08 */
	{0x0927, CONSONANT, "dh"},    /* 09 */
	{0x092b, CONSONANT, "ph"},    /* 10 */
	{0x092d, CONSONANT, "bh"},    /* 11 */
	{0x0915, CONSONANT, "k"},     /* 12 */
	{0x0917, CONSONANT, "g"},     /* 13 */
	{0x0919, CONSONANT, "n"},     /* 14 */
	{0x0939, CONSONANT, "h"},     /* 15 */
	{0x091a, CONSONANT, "č"},     /* 16 */
	{0x091e, CONSONANT, "ň"},     /* 17 */
	{0x092f, CONSONANT, "j"},     /* 18 */
	{0x0936, CONSONANT, "š"},     /* 19 */
	{0x091F, CONSONANT, "t"},     /* 20 */
	{0x0921, CONSONANT, "d"},     /* 21 */
	{0x0923, CONSONANT, "n"},     /* 22 */
	{0x0930, CONSONANT, "r"},     /* 23 */
	{0x0937, CONSONANT, "š"},     /* 24 */
	{0x0924, CONSONANT, "t"},     /* 25 */
	{0x0926, CONSONANT, "d"},     /* 26 */
	{0x0928, CONSONANT, "n"},     /* 27 */
	{0x0932, CONSONANT, "l"},     /* 28 */
	{0x0938, CONSONANT, "s"},     /* 29 */
	{0x092a, CONSONANT, "p"},     /* 30 */
	{0x092c, CONSONANT, "b"},     /* 31 */
	{0x092e, CONSONANT, "m"},     /* 32 */
	{0x0935, CONSONANT, "v"},     /* 33 */
	{0x0933, CONSONANT, "l"},     /* 34 */

	/* Codas */
	{0x0902, CODA, "m"},          /* anusvara */
	{0x0903, CODA, ""},           /* visarga  */
	{0x093d, CODA, "'"},          /* avagraha */
	{0x0901, CODA, "m"},          /* candrabindu */

	/* Numbers */
	{0x0966, NUMBER, "0"},
	{0x0967, NUMBER, "1"},
	{0x0968, NUMBER, "2"},
	{0x0969, NUMBER, "3"},
	{0x096a, NUMBER, "4"},
	{0x096b, NUMBER, "5"},
	{0x096c, NUMBER, "6"},
	{0x096d, NUMBER, "7"},
	{0x096e, NUMBER, "8"},
	{0x096f, NUMBER, "9"},

	/* Diacritic modifiers (they replace the schwa of the previous consonant) */
	{0x0948, VOWEL_SIGN, "ai"},
	{0x094c, VOWEL_SIGN, "au"},
	{0x093e, VOWEL_SIGN, "á"},
	{0x093f, VOWEL_SIGN, "i"},
	{0x0940, VOWEL_SIGN, "í"},
	{0x0941, VOWEL_SIGN, "u"},
	{0x0942, VOWEL_SIGN, "ú"},
	{0x0943, VOWEL_SIGN, "r"},
	{0x0944, VOWEL_SIGN, "r"},
	{0x0962, VOWEL_SIGN, "l"},
	{0x0963, VOWEL_SIGN, "l"},
	{0x0947, VOWEL_SIGN, "é"},
	{0x094b, VOWEL_SIGN, "ó"},
	/* empty text: as a vowel sign it only removes the preceding schwa */
	{0x094d, VOWEL_SIGN, ""},     /* virama */
};

/*
 * Report whether code point c falls into the block 0x0915..0x0939, which
 * this module treats as consonants. Returns 1 when it does, 0 otherwise.
 */
static inline int is_consonant(unsigned int c)
{
	if (c < 0x0915)
		return 0;
	if (c > 0x0939)
		return 0;

	return 1;
}

/*
 * Report whether code point c is one of the nasals handled by the nasal
 * filter: the five nasal consonants or the anusvara.
 * Returns 1 for a nasal, 0 for anything else.
 */
static inline int is_nasal(unsigned int c)
{
	switch (c) {
	case 0x0919:
	case 0x091e:
	case 0x0923:
	case 0x0928:
	case 0x092e:
	case 0x0902: /* anusvara */
		return 1;
	default:
		return 0;
	}
}

/*
 * Adapt the transcription of a nasal to the consonant that follows it.
 *
 * latin: output buffer being built
 * pos:   number of bytes already written to latin
 * prev:  previously processed code point
 * c:     code point that is about to be transcribed
 *
 * When prev is a nasal whose last output byte is not the inherent schwa
 * (i.e. the nasal is pronounced directly before c), that byte becomes 'm'
 * before a labial and 'n' before any other consonant. Nothing is changed
 * when the nasal still carries its schwa, because the byte in question is
 * then a vowel and must survive.
 */
static void nasal_consonants_filter(char *latin, unsigned int *pos,
				    unsigned int prev, unsigned int c)
{
	char *tail;

	/* nothing written yet means there is no byte to rewrite */
	if (!is_nasal(prev) || *pos == 0)
		return;

	tail = latin + *pos - 1;

	/* never overwrite the inherent vowel of a nasal consonant */
	if (*tail == SCHWA_CHARACTER)
		return;

	switch (c) {
	/* rewrite nasals before labials to 'm' */
	case 0x092b: /* ph */
	case 0x092d: /* bh */
	case 0x092a: /* p */
	case 0x092c: /* b */
	case 0x092e: /* m */
		*tail = 'm';
		break;
	default:
		if (is_consonant(c))
			*tail = 'n';
		break;
	}
}

/*
 * Apply the end-of-word rules once a Devanagari word has just ended.
 *
 * latin: output buffer; the encoded form of c is already its last item
 * pos:   number of bytes written to latin, updated when bytes are removed
 * prev:  previously processed code point (last letter of the word)
 * c:     current code point (first non-Devanagari one after the word)
 *
 * Two rules are applied, both of which shrink the output by one byte and
 * shift the already written bytes of c to the left accordingly:
 *  - a word-final 'á' produced by the vowel sign 0x093e becomes 'a';
 *  - a word-final 'm' followed by a virama is dropped.
 */
static void end_of_word_filter(char *latin, unsigned int *pos,
			       unsigned int prev, unsigned int c)
{
	unsigned int len;
	char *cur;

	if (!is_devanagari(prev) || is_devanagari(c))
		return;

	len = utf8_char_length(c);
	if (*pos < len)
		return;

	/* first byte of the encoded c at the end of the buffer */
	cur = latin + *pos - len;

	/* shorten ending 'á' (two bytes) to 'a' (one byte) */
	if (prev == 0x093e && *pos >= len + 2) { /* modifier 'á' */
		cur[-2] = 'a';
		memmove(cur - 1, cur, len);
		*pos = *pos - 1;
		return;
	}

	/* remove singular nominative suffix */
	if (prev == VIRAMA && *pos > len && cur[-1] == 'm') {
		/* move exactly the bytes of c, not c bytes */
		memmove(cur - 1, cur, len);
		*pos = *pos - 1;
	}
}

static struct translit_letter *letter_by_code(unsigned int c)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(table); i++) {
		if (table[i].code == c) {
			return table + i;
		}
	}

	return NULL;
}

/*
 * Transcribe a NUL-terminated UTF-8 Devanagari string into Czech.
 *
 * devanagari: input text; code points without a table entry are copied
 *             to the output unchanged
 * ret:        receives a newly allocated NUL-terminated string on success
 *             (the buffer comes from realloc(), so the caller releases it
 *             with free()); set to NULL on failure
 *
 * Returns 0 on success, EHINDI when the input contains a nukta, or ENOMEM
 * when the output buffer cannot be grown.
 */
int transcript_devanagari_to_czech(const char *devanagari, char **ret)
{
	struct translit_letter *letter;
	unsigned int c, prev = 0, alloc = 0, done = 0, len;
	/*
	 * A single iteration may write a whole table entry followed by the
	 * schwa, which may be longer than UNICODE_MAX_LENGTH bytes; keep
	 * some extra slack so these writes never pass the end of the buffer.
	 */
	const unsigned int headroom = UNICODE_MAX_LENGTH + 8;
	const char *src = devanagari;
	char *latin = NULL;
	char *grown;

	while (1) {
		if (alloc < done + headroom) {
			/* keep the old pointer so it can be freed on failure */
			grown = realloc(latin, alloc + CHUNKSIZE);
			if (!grown) {
				free(latin);
				*ret = NULL;
				return ENOMEM;
			}
			latin = grown;
			alloc += CHUNKSIZE;
		}

		c = utf8_unpack_char(src);
		len = utf8_char_length(c);
		src += len;

		if (c == ZERO_WIDTH_JOINER)
			continue;

		if (c == NUKTA) {
			free(latin);
			*ret = NULL;
			return EHINDI;
		}

		nasal_consonants_filter(latin, &done, prev, c);

		letter = letter_by_code(c);
		if (letter) {
			switch (letter->type) {
			case CONSONANT:
				strcpy(latin + done, letter->data);
				done += strlen(letter->data);
				*(latin + done++) = SCHWA_CHARACTER;
				break;
			case VOWEL_SIGN:
				/*
				 * delete the inherent schwa; any other byte is
				 * not ours to remove (it may be a separator or
				 * part of a multi-byte sequence)
				 */
				if (done && latin[done - 1] == SCHWA_CHARACTER)
					done--;
				/* fallthrough */
			default:
				strcpy(latin + done, letter->data);
				done += strlen(letter->data);
				break;
			}
		} else {
			utf8_pack_char(latin + done, c);
			done += len;
		}

		end_of_word_filter(latin, &done, prev, c);

		/* the terminating NUL has been processed like any character */
		if (c == 0)
			break;
		prev = c;
	}

	/* the last byte written belongs to the terminator */
	latin[done ? done - 1 : 0] = '\0';

	*ret = latin;

	return 0;
}