reimplement czech transcription of sanskrit

2020-01-02 16:12:51 +01:00 · 2020-01-02 16:12:51 +01:00 · 9c3db8ebb8
commit 9c3db8ebb8
parent 412be221a7
8 changed files with 300 additions and 2 deletions
--- a/4
+++ b/4
@ -2,8 +2,8 @@ PREFIX=/usr

 .PHONY: main test install uninstall clean

-OBJECTS      = iast.o transliteration.o utf8.o encoder.o
-TEST_OBJECTS = tests/main.o tests/translit.o
+OBJECTS      = iast.o iast-czech.o transliteration.o transcription.o utf8.o encoder.o
+TEST_OBJECTS = tests/main.o tests/translit.o tests/transcript.o
 CFLAGS       = -Wall
 LIBS         =
 TEST_CFLAGS  = $(CFLAGS) $(shell pkg-config --cflags check)
--- a/iast-czech.c
+++ b/iast-czech.c
@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "compat.h"
+#include "iast-czech.h"
+
+/*
+ * Devanagari -> Czech transcription table.
+ *
+ * Each entry is { code point, letter class, UTF-8 replacement text }.
+ * The list ends with an all-zero entry (code 0, NULL text).
+ *
+ * Distinct code points may share one spelling (e.g. 0x091F and 0x0924
+ * both give "t"), so the mapping cannot be reversed.
+ */
+static struct translit_letter table[] = {
+
+	/* Special characters */
+	{0x0950, SPECIAL, "óm"},      /* aum */
+
+	/* Vowels */
+	{0x0910, VOWEL, "ai"},        /* 01 */
+	{0x0914, VOWEL, "au"},        /* 02 */
+	{0x0905, VOWEL, "a"},         /* 03 */
+	{0x0906, VOWEL, "á"},         /* 04 */
+	{0x0907, VOWEL, "i"},         /* 05 */
+	{0x0908, VOWEL, "í"},         /* 06 */
+	{0x0909, VOWEL, "u"},         /* 07 */
+	{0x090a, VOWEL, "ú"},         /* 08 */
+	{0x090b, VOWEL, "r"},         /* 09 */
+	{0x0960, VOWEL, "r"},         /* 10 */
+	{0x090c, VOWEL, "l"},         /* 11 */
+	{0x0961, VOWEL, "l"},         /* 12 */
+	{0x090f, VOWEL, "é"},         /* 13 */
+	{0x0913, VOWEL, "ó"},         /* 14 */
+
+	/* Consonants */
+	{0x0916, CONSONANT, "kh"},    /* 01 */
+	{0x0918, CONSONANT, "gh"},    /* 02 */
+	{0x091b, CONSONANT, "ch"},    /* 03 */
+	{0x091d, CONSONANT, "džh"},   /* 04 */
+	{0x091c, CONSONANT, "dž"},    /* 05 */
+	{0x0920, CONSONANT, "th"},    /* 06 */
+	{0x0922, CONSONANT, "dh"},    /* 07 */
+	{0x0925, CONSONANT, "th"},    /* 08 */
+	{0x0927, CONSONANT, "dh"},    /* 09 */
+	{0x092b, CONSONANT, "ph"},    /* 10 */
+	{0x092d, CONSONANT, "bh"},    /* 11 */
+	{0x0915, CONSONANT, "k"},     /* 12 */
+	{0x0917, CONSONANT, "g"},     /* 13 */
+	{0x0919, CONSONANT, "n"},     /* 14 */
+	{0x0939, CONSONANT, "h"},     /* 15 */
+	{0x091a, CONSONANT, "c"},     /* 16 */
+	{0x091e, CONSONANT, "ň"},     /* 17 */
+	{0x092f, CONSONANT, "j"},     /* 18 */
+	{0x0936, CONSONANT, "š"},     /* 19 */
+	{0x091F, CONSONANT, "t"},     /* 20 */
+	{0x0921, CONSONANT, "d"},     /* 21 */
+	{0x0923, CONSONANT, "n"},     /* 22 */
+	{0x0930, CONSONANT, "r"},     /* 23 */
+	{0x0937, CONSONANT, "š"},     /* 24 */
+	{0x0924, CONSONANT, "t"},     /* 25 */
+	{0x0926, CONSONANT, "d"},     /* 26 */
+	{0x0928, CONSONANT, "n"},     /* 27 */
+	{0x0932, CONSONANT, "l"},     /* 28 */
+	{0x0938, CONSONANT, "s"},     /* 29 */
+	{0x092a, CONSONANT, "p"},     /* 30 */
+	{0x092c, CONSONANT, "b"},     /* 31 */
+	{0x092e, CONSONANT, "m"},     /* 32 */
+	{0x0935, CONSONANT, "v"},     /* 33 */
+
+	/* Codas */
+	{0x0902, CODA, "m"},          /* anusvara */
+	{0x0903, CODA, ""},           /* visarga  */
+	{0x093d, CODA, "'"},          /* avagraha */
+
+	/* Numbers */
+	{0x0966, NUMBER, "0"},
+	{0x0967, NUMBER, "1"},
+	{0x0968, NUMBER, "2"},
+	{0x0969, NUMBER, "3"},
+	{0x096a, NUMBER, "4"},
+	{0x096b, NUMBER, "5"},
+	{0x096c, NUMBER, "6"},
+	{0x096d, NUMBER, "7"},
+	{0x096e, NUMBER, "8"},
+	{0x096f, NUMBER, "9"},
+
+	/* Diacritic modifiers */
+	{0x0948, VOWEL_SIGN, "ai"},
+	{0x094c, VOWEL_SIGN, "au"},
+	{0x093e, VOWEL_SIGN, "á"},
+	{0x093f, VOWEL_SIGN, "i"},
+	{0x0940, VOWEL_SIGN, "í"},
+	{0x0941, VOWEL_SIGN, "u"},
+	{0x0942, VOWEL_SIGN, "ú"},
+	{0x0943, VOWEL_SIGN, "r"},
+	{0x0944, VOWEL_SIGN, "r"},
+	{0x0962, VOWEL_SIGN, "l"},
+	{0x0963, VOWEL_SIGN, "l"},
+	{0x0947, VOWEL_SIGN, "é"},
+	{0x094b, VOWEL_SIGN, "ó"},
+	{0x094d, VOWEL_SIGN, ""},     /* virama */
+
+	/* end-of-table marker */
+	{0, 0, NULL}
+};
+
+/*
+ * Return the Devanagari -> Czech transcription table.
+ *
+ * The result points at this file's static array, which ends with an entry
+ * whose code is 0. The caller must not free it.
+ */
+struct translit_letter *get_iast_czech_transliteration_table(void)
+{
+	return table;
+}
--- a/iast-czech.h
+++ b/iast-czech.h
@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __IAST_CZECH_H
+#define __IAST_CZECH_H
+
+#include "transliteration.h"
+
+/*
+ * Return the Devanagari -> Czech transcription table defined in
+ * iast-czech.c. The table ends with an entry whose code is 0; it is static
+ * storage and must not be freed.
+ */
+struct translit_letter *get_iast_czech_transliteration_table(void);
+
+#endif /* __IAST_CZECH_H */
--- a/tests/main.c
+++ b/tests/main.c
@ -1,5 +1,6 @@
 #include "test.h"
 #include "translit.h"
+#include "transcript.h"

 static Suite *create_test_suite()
 {
@ -10,6 +11,7 @@ static Suite *create_test_suite()
 	test_case = tcase_create(NULL);

 	register_translit_tests(test_case);
+	register_transcript_tests(test_case);
 	suite_add_tcase(suite, test_case);

 	return suite;
--- a/tests/transcript.c
+++ b/tests/transcript.c
@ -0,0 +1,34 @@
+#include "test.h"
+#include "transcript.h"
+#include "../transcription.h"
+
+/*
+ * Feed a handful of Devanagari words through the transcriber and compare
+ * each result with the expected Czech spelling.
+ */
+START_TEST(test_transcript_devanagari_to_czech)
+{
+	/* { Devanagari input, expected Czech output } */
+	static const char *const samples[][2] = {
+		{"तन्त्रशास्त्रम्", "tantrašástra"},
+		{"सांख्य", "sánkhja"},
+		{"महाभारतम्", "mahábhárata"},
+		{"योगः", "jóga"},
+		{"भगवद्गीता", "bhagavadgíta"},
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
+		char *czech = transcript_devanagari_to_czech(samples[i][0]);
+
+		ck_assert_str_eq(samples[i][1], czech);
+		free(czech);
+	}
+}
+END_TEST
+
+/* Add the transcription tests defined in this file to test_case. */
+void register_transcript_tests(TCase *test_case)
+{
+	tcase_add_test(test_case, test_transcript_devanagari_to_czech);
+}
--- a/tests/transcript.h
+++ b/tests/transcript.h
@ -0,0 +1,8 @@
+#ifndef __TEST_TRANSCRIPT_H
+#define __TEST_TRANSCRIPT_H
+
+#include <check.h>
+
+/* Add the Devanagari -> Czech transcription tests to test_case. */
+void register_transcript_tests(TCase *test_case);
+
+#endif /* __TEST_TRANSCRIPT_H */
--- a/transcription.c
+++ b/transcription.c
@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "compat.h"
+#include "transcription.h"
+#include "iast-czech.h"
+#include "utf8.h"
+
+#include <stdlib.h>
+
+#define CHUNKSIZE 1024
+#define SCHWA_CHARACTER 'a'
+
+/*
+ * Return 1 when code point c lies in the inclusive range 0x0915..0x0939
+ * (the table's "k" through "h"), 0 otherwise.
+ */
+static inline int is_consonant(unsigned int c)
+{
+	if (c < 0x0915)
+		return 0;
+
+	return c <= 0x0939;
+}
+
+/*
+ * Return 1 when code point c is one of the five nasal consonants or the
+ * anusvara, 0 otherwise.
+ */
+static inline int is_nasal(unsigned int c)
+{
+	switch (c) {
+	case 0x0919:
+	case 0x091e:
+	case 0x0923:
+	case 0x0928:
+	case 0x092e:
+	case 0x0902: /* anusvara */
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Adapt the nasal written for prev to the code point c that follows it.
+ *
+ * latin: output buffer
+ * pos:   number of bytes written to latin so far (not modified)
+ * prev:  code point handled in the previous step
+ * c:     code point about to be emitted
+ *
+ * The last output byte is rewritten to 'm' before a labial and to 'n'
+ * before any other consonant, but only when that byte is the nasal itself.
+ * A nasal consonant is emitted together with its inherent schwa; in that
+ * case the last byte is the vowel and is left alone.
+ */
+static void nasal_consonants_filter(char *latin, unsigned int *pos,
+				    unsigned int prev, unsigned int c)
+{
+	char *tail;
+
+	/* nothing written yet, or nothing to adapt */
+	if (*pos == 0 || !is_nasal(prev))
+		return;
+
+	tail = latin + *pos - 1;
+
+	/* the nasal still carries its schwa: never overwrite the vowel */
+	if (*tail == SCHWA_CHARACTER)
+		return;
+
+	switch (c) {
+	case 0x092b: /* ph */
+	case 0x092d: /* bh */
+	case 0x092a: /* p */
+	case 0x092c: /* b */
+	case 0x092e: /* m */
+		/* nasals before labials become 'm' */
+		*tail = 'm';
+		break;
+	default:
+		if (is_consonant(c))
+			*tail = 'n';
+		break;
+	}
+}
+
+/*
+ * Apply word-final adjustments once a Devanagari word has ended.
+ *
+ * latin: output buffer
+ * pos:   number of bytes written so far; decremented when a byte is dropped
+ * prev:  last code point of the word
+ * c:     the code point that follows it
+ *
+ * Runs only when prev is Devanagari and c is not. It assumes the caller has
+ * just appended c as utf8_char_length(c) raw bytes, which is what the
+ * caller does for code points absent from the table.
+ *
+ *  - a final long "á" (vowel sign 0x093e) is shortened to 'a'
+ *  - a final 'm' closed by a virama (0x094d) is removed
+ *
+ * In both cases the bytes of c are shifted one position to the left so
+ * that they stay directly behind the word.
+ */
+static void end_of_word_filter(char *latin, unsigned int *pos,
+			       unsigned int prev, unsigned int c)
+{
+	unsigned int len;
+	char *cur;
+
+	if (!is_devanagari(prev) || is_devanagari(c))
+		return;
+
+	len = utf8_char_length(c);
+	if (*pos < len)
+		return;
+
+	/* first byte of the terminator c inside the output */
+	cur = latin + *pos - len;
+
+	if (prev == 0x093e && *pos >= len + 2) {
+		/*
+		 * "á" occupies the two bytes before c (assumes the table text
+		 * is UTF-8): put 'a' in the first and close the gap over the
+		 * second.
+		 */
+		cur[-2] = 'a';
+		memmove(cur - 1, cur, len);
+		*pos = *pos - 1;
+	} else if (prev == 0x094d && *pos > len && cur[-1] == 'm') {
+		/* remove singular nominative suffix */
+		memmove(cur - 1, cur, len);
+		*pos = *pos - 1;
+	}
+}
+
+/*
+ * Look up code point c in a table that ends with an entry whose code is 0.
+ * Returns the matching entry, or NULL when c is not listed.
+ */
+static struct translit_letter *letter_by_code(struct translit_letter *table,
+					      unsigned int c)
+{
+	struct translit_letter *entry;
+
+	for (entry = table; entry->code != 0; entry++) {
+		if (entry->code == c)
+			return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Transcribe UTF-8 encoded Devanagari text into Czech spelling.
+ *
+ * devanagari: NUL-terminated input string
+ *
+ * Returns a newly allocated NUL-terminated string that the caller must
+ * free(), or NULL when devanagari is NULL or memory runs out.
+ *
+ * Code points found in the transcription table are replaced by their Czech
+ * text; everything else is copied through unchanged. Consonants are written
+ * with an inherent schwa, which a following vowel sign (or virama) replaces.
+ */
+char *transcript_devanagari_to_czech(const char *devanagari)
+{
+	struct translit_letter *table, *letter;
+	unsigned int c, prev = 0, alloc = 0, done = 0, len, need;
+	const char *src = devanagari;
+	char *latin = NULL, *grown;
+
+	if (!devanagari)
+		return NULL;
+
+	table = get_iast_czech_transliteration_table();
+
+	while (1) {
+		c = utf8_unpack_char(src);
+		len = utf8_char_length(c);
+		src += len;
+
+		letter = letter_by_code(table, c);
+
+		/*
+		 * Room needed by this step: table text + schwa + the NUL that
+		 * strcpy() appends, or the raw character itself.
+		 */
+		if (letter)
+			need = (unsigned int)strlen(letter->data) + 2;
+		else
+			need = len + 1;
+		if (need < UNICODE_MAX_LENGTH)
+			need = UNICODE_MAX_LENGTH;
+
+		if (alloc < done + need) {
+			/* keep the old pointer so it can be released on failure */
+			grown = realloc(latin, alloc + CHUNKSIZE);
+			if (!grown) {
+				free(latin);
+				return NULL;
+			}
+			latin = grown;
+			alloc += CHUNKSIZE;
+		}
+
+		nasal_consonants_filter(latin, &done, prev, c);
+
+		if (letter) {
+			switch (letter->type) {
+			case CONSONANT:
+				strcpy(latin + done, letter->data);
+				done += strlen(letter->data);
+				*(latin + done++) = SCHWA_CHARACTER;
+				break;
+			case VOWEL_SIGN:
+				/* the sign replaces the preceding schwa */
+				if (done)
+					done--;
+				strcpy(latin + done, letter->data);
+				done += strlen(letter->data);
+				break;
+			default:
+				strcpy(latin + done, letter->data);
+				done += strlen(letter->data);
+				break;
+			}
+		} else {
+			/* not in the table (includes the final NUL): copy as is */
+			utf8_pack_char(latin + done, c);
+			done += len;
+		}
+
+		end_of_word_filter(latin, &done, prev, c);
+
+		if (c == 0)
+			break;
+		prev = c;
+	}
+
+	/*
+	 * done counts the copied terminator as well, so the string ends one
+	 * byte earlier (relies on utf8_char_length(0) being 1 -- TODO confirm).
+	 */
+	*(latin + done - 1) = '\0';
+
+	return latin;
+}
--- a/transcription.h
+++ b/transcription.h
@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __TRANSCRIPTION_H
+#define __TRANSCRIPTION_H
+
+/*
+ * Transcribe UTF-8 encoded Devanagari text into Czech spelling.
+ * The result is a heap-allocated, NUL-terminated string; the caller
+ * releases it with free().
+ */
+char *transcript_devanagari_to_czech(const char *devanagari);
+
+#endif /* __TRANSCRIPTION_H */