From 9c3db8ebb850b0945383160e6aa513eb61e64970 Mon Sep 17 00:00:00 2001
From: Vlasta Vesely
Date: Thu, 2 Jan 2020 16:12:51 +0100
Subject: [PATCH] reimplement czech transcription of sanskrit

---
 Makefile           |   4 +-
 iast-czech.c       | 103 ++++++++++++++++++++++++
 iast-czech.h       |  10 +++
 tests/main.c       |   2 +
 tests/transcript.c |  44 +++++++++++
 tests/transcript.h |   8 ++
 transcription.c    | 190 +++++++++++++++++++++++++++++++++++++++++++++
 transcription.h    |   8 ++
 8 files changed, 367 insertions(+), 2 deletions(-)
 create mode 100644 iast-czech.c
 create mode 100644 iast-czech.h
 create mode 100644 tests/transcript.c
 create mode 100644 tests/transcript.h
 create mode 100644 transcription.c
 create mode 100644 transcription.h

diff --git a/Makefile b/Makefile
index b60b0dc..24b2f7a 100644
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,8 @@ PREFIX=/usr
 
 .PHONY: main test install uninstall clean
 
-OBJECTS = iast.o transliteration.o utf8.o encoder.o
-TEST_OBJECTS = tests/main.o tests/translit.o
+OBJECTS = iast.o iast-czech.o transliteration.o transcription.o utf8.o encoder.o
+TEST_OBJECTS = tests/main.o tests/translit.o tests/transcript.o
 CFLAGS = -Wall
 LIBS =
 TEST_CFLAGS = $(CFLAGS) $(shell pkg-config --cflags check)
diff --git a/iast-czech.c b/iast-czech.c
new file mode 100644
index 0000000..0a28876
--- /dev/null
+++ b/iast-czech.c
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "compat.h"
+#include "iast-czech.h"
+
+/* Czech transcription of Devanagari letters, terminated by code 0. */
+static struct translit_letter table[] = {
+
+	/* Special characters */
+	{0x0950, SPECIAL, "óm"},	/* aum */
+
+	/* Vowels */
+	{0x0910, VOWEL, "ai"},		/* 01 */
+	{0x0914, VOWEL, "au"},		/* 02 */
+	{0x0905, VOWEL, "a"},		/* 03 */
+	{0x0906, VOWEL, "á"},		/* 04 */
+	{0x0907, VOWEL, "i"},		/* 05 */
+	{0x0908, VOWEL, "í"},		/* 06 */
+	{0x0909, VOWEL, "u"},		/* 07 */
+	{0x090a, VOWEL, "ú"},		/* 08 */
+	{0x090b, VOWEL, "r"},		/* 09 */
+	{0x0960, VOWEL, "r"},		/* 10 */
+	{0x090c, VOWEL, "l"},		/* 11 */
+	{0x0961, VOWEL, "l"},		/* 12 */
+	{0x090f, VOWEL, "é"},		/* 13 */
+	{0x0913, VOWEL, "ó"},		/* 14 */
+
+	/* Consonants */
+	{0x0916, CONSONANT, "kh"},	/* 01 */
+	{0x0918, CONSONANT, "gh"},	/* 02 */
+	{0x091b, CONSONANT, "ch"},	/* 03 */
+	{0x091d, CONSONANT, "džh"},	/* 04 */
+	{0x091c, CONSONANT, "dž"},	/* 05 */
+	{0x0920, CONSONANT, "th"},	/* 06 */
+	{0x0922, CONSONANT, "dh"},	/* 07 */
+	{0x0925, CONSONANT, "th"},	/* 08 */
+	{0x0927, CONSONANT, "dh"},	/* 09 */
+	{0x092b, CONSONANT, "ph"},	/* 10 */
+	{0x092d, CONSONANT, "bh"},	/* 11 */
+	{0x0915, CONSONANT, "k"},	/* 12 */
+	{0x0917, CONSONANT, "g"},	/* 13 */
+	{0x0919, CONSONANT, "n"},	/* 14 */
+	{0x0939, CONSONANT, "h"},	/* 15 */
+	{0x091a, CONSONANT, "c"},	/* 16 */
+	{0x091e, CONSONANT, "ň"},	/* 17 */
+	{0x092f, CONSONANT, "j"},	/* 18 */
+	{0x0936, CONSONANT, "š"},	/* 19 */
+	{0x091F, CONSONANT, "t"},	/* 20 */
+	{0x0921, CONSONANT, "d"},	/* 21 */
+	{0x0923, CONSONANT, "n"},	/* 22 */
+	{0x0930, CONSONANT, "r"},	/* 23 */
+	{0x0937, CONSONANT, "š"},	/* 24 */
+	{0x0924, CONSONANT, "t"},	/* 25 */
+	{0x0926, CONSONANT, "d"},	/* 26 */
+	{0x0928, CONSONANT, "n"},	/* 27 */
+	{0x0932, CONSONANT, "l"},	/* 28 */
+	{0x0938, CONSONANT, "s"},	/* 29 */
+	{0x092a, CONSONANT, "p"},	/* 30 */
+	{0x092c, CONSONANT, "b"},	/* 31 */
+	{0x092e, CONSONANT, "m"},	/* 32 */
+	{0x0935, CONSONANT, "v"},	/* 33 */
+
+	/* Codas */
+	{0x0902, CODA, "m"},		/* anusvara */
+	{0x0903, CODA, ""},		/* visarga */
+	{0x093d, CODA, "'"},		/* avagraha */
+
+	/* Numbers */
+	{0x0966, NUMBER, "0"},
+	{0x0967, NUMBER, "1"},
+	{0x0968, NUMBER, "2"},
+	{0x0969, NUMBER, "3"},
+	{0x096a, NUMBER, "4"},
+	{0x096b, NUMBER, "5"},
+	{0x096c, NUMBER, "6"},
+	{0x096d, NUMBER, "7"},
+	{0x096e, NUMBER, "8"},
+	{0x096f, NUMBER, "9"},
+
+	/* Diacritic modifiers */
+	{0x0948, VOWEL_SIGN, "ai"},
+	{0x094c, VOWEL_SIGN, "au"},
+	{0x093e, VOWEL_SIGN, "á"},
+	{0x093f, VOWEL_SIGN, "i"},
+	{0x0940, VOWEL_SIGN, "í"},
+	{0x0941, VOWEL_SIGN, "u"},
+	{0x0942, VOWEL_SIGN, "ú"},
+	{0x0943, VOWEL_SIGN, "r"},
+	{0x0944, VOWEL_SIGN, "r"},
+	{0x0962, VOWEL_SIGN, "l"},
+	{0x0963, VOWEL_SIGN, "l"},
+	{0x0947, VOWEL_SIGN, "é"},
+	{0x094b, VOWEL_SIGN, "ó"},
+	{0x094d, VOWEL_SIGN, ""},	/* virama */
+
+	{0, 0, NULL}
+};
+
+/* Return the table used for the Czech transcription of Devanagari. */
+struct translit_letter *get_iast_czech_transliteration_table(void)
+{
+	return table;
+}
diff --git a/iast-czech.h b/iast-czech.h
new file mode 100644
index 0000000..8bc3f96
--- /dev/null
+++ b/iast-czech.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __IAST_CZECH_H
+#define __IAST_CZECH_H
+
+#include "transliteration.h"
+
+struct translit_letter *get_iast_czech_transliteration_table(void);
+
+#endif /* __IAST_CZECH_H */
diff --git a/tests/main.c b/tests/main.c
index 27ab370..d77faff 100644
--- a/tests/main.c
+++ b/tests/main.c
@@ -1,5 +1,6 @@
 #include "test.h"
 #include "translit.h"
+#include "transcript.h"
 
 static Suite *create_test_suite()
 {
@@ -10,6 +11,7 @@ static Suite *create_test_suite()
 	test_case = tcase_create(NULL);
 
 	register_translit_tests(test_case);
+	register_transcript_tests(test_case);
 	suite_add_tcase(suite, test_case);
 
 	return suite;
diff --git a/tests/transcript.c b/tests/transcript.c
new file mode 100644
index 0000000..7ee5d6c
--- /dev/null
+++ b/tests/transcript.c
@@ -0,0 +1,44 @@
+#include "test.h"
+#include "transcript.h"
+#include "../transcription.h"
+
+START_TEST(test_transcript_devanagari_to_czech)
+{
+	char *czech;
+
+	czech = transcript_devanagari_to_czech("तन्त्रशास्त्रम्");
+	ck_assert_str_eq("tantrašástra", czech);
+	free(czech);
+
+	czech = transcript_devanagari_to_czech("सांख्य");
+	ck_assert_str_eq("sánkhja", czech);
+	free(czech);
+
+	czech = transcript_devanagari_to_czech("महाभारतम्");
+	ck_assert_str_eq("mahábhárata", czech);
+	free(czech);
+
+	czech = transcript_devanagari_to_czech("योगः");
+	ck_assert_str_eq("jóga", czech);
+	free(czech);
+
+	czech = transcript_devanagari_to_czech("भगवद्गीता");
+	ck_assert_str_eq("bhagavadgíta", czech);
+	free(czech);
+
+	/* a nasal keeps its inherent vowel in front of a labial */
+	czech = transcript_devanagari_to_czech("मम");
+	ck_assert_str_eq("mama", czech);
+	free(czech);
+
+	/* word endings are adjusted in front of a separator, too */
+	czech = transcript_devanagari_to_czech("गीता गीता");
+	ck_assert_str_eq("gíta gíta", czech);
+	free(czech);
+}
+END_TEST
+
+void register_transcript_tests(TCase *test_case)
+{
+	tcase_add_test(test_case, test_transcript_devanagari_to_czech);
+}
diff --git a/tests/transcript.h b/tests/transcript.h
new file mode 100644
index 0000000..efc007d
--- /dev/null
+++ b/tests/transcript.h
@@ -0,0 +1,8 @@
+#ifndef __TEST_TRANSCRIPT_H
+#define __TEST_TRANSCRIPT_H
+
+#include <check.h>
+
+void register_transcript_tests(TCase *test_case);
+
+#endif /* __TEST_TRANSCRIPT_H */
diff --git a/transcription.c b/transcription.c
new file mode 100644
index 0000000..194fecd
--- /dev/null
+++ b/transcription.c
@@ -0,0 +1,190 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "compat.h"
+#include "transcription.h"
+#include "iast-czech.h"
+#include "utf8.h"
+
+#define CHUNKSIZE 1024
+#define SCHWA_CHARACTER 'a'
+#define SIGN_AA 0x093e		/* vowel sign 'á' */
+#define SIGN_VIRAMA 0x094d	/* suppresses the inherent schwa */
+
+/* Tell whether 'c' lies in the Devanagari consonant range. */
+static inline int is_consonant(unsigned int c)
+{
+	return (c >= 0x0915 && c <= 0x0939);
+}
+
+/* Tell whether 'c' is a nasal consonant or the anusvara. */
+static inline int is_nasal(unsigned int c)
+{
+	return c == 0x0919 || c == 0x091e || c == 0x0923 ||
+		c == 0x0928 || c == 0x092e || c == 0x0902;
+}
+
+/*
+ * Assimilate a bare nasal that directly precedes the consonant 'c'.
+ *
+ * 'prev' is the code point that produced the last byte of 'latin'.  A nasal
+ * becomes 'm' before a labial and 'n' before any other consonant.  A nasal
+ * that still carries its inherent schwa is a syllable of its own and must
+ * stay untouched, otherwise "mama" would come out as "mmma".
+ */
+static void nasal_consonants_filter(char *latin, unsigned int *pos,
+		unsigned int prev, unsigned int c)
+{
+	char *tail;
+
+	if (*pos == 0 || !is_nasal(prev))
+		return;
+
+	tail = latin + *pos - 1;
+	if (*tail == SCHWA_CHARACTER)
+		return;
+
+	switch (c) {
+	case 0x092b:	/* ph */
+	case 0x092d:	/* bh */
+	case 0x092a:	/* p */
+	case 0x092c:	/* b */
+	case 0x092e:	/* m */
+		*tail = 'm';
+		break;
+	default:
+		if (is_consonant(c))
+			*tail = 'n';
+		break;
+	}
+}
+
+/*
+ * Fix up the ending of a word; call it before 'c' is appended to 'latin'.
+ *
+ * The word ends when 'prev' is a Devanagari code point and 'c' is not.
+ * Working on the tail before 'c' is written keeps the separator intact
+ * whatever its encoded length is.
+ */
+static void end_of_word_filter(char *latin, unsigned int *pos,
+		unsigned int prev, unsigned int c)
+{
+	if (!is_devanagari(prev) || is_devanagari(c))
+		return;
+
+	/* shorten ending 'á' (two bytes in UTF-8) to 'a' */
+	if (prev == SIGN_AA && *pos >= 2) {
+		latin[*pos - 2] = 'a';
+		*pos = *pos - 1;
+	}
+
+	/* remove singular nominative suffix */
+	if (prev == SIGN_VIRAMA && *pos >= 1 && latin[*pos - 1] == 'm')
+		*pos = *pos - 1;
+}
+
+/* Look up 'c' in the zero-terminated table; NULL when it is missing. */
+static struct translit_letter *letter_by_code(struct translit_letter *table,
+		unsigned int c)
+{
+	while (table->code != 0) {
+		if (table->code == c)
+			return table;
+		table++;
+	}
+
+	return NULL;
+}
+
+/*
+ * Grow '*buf' in CHUNKSIZE steps until it holds at least 'needed' bytes.
+ * Return 0 on success, -1 when out of memory ('*buf' stays valid then).
+ */
+static int reserve(char **buf, unsigned int *alloc, unsigned int needed)
+{
+	unsigned int size = *alloc;
+	char *tmp;
+
+	if (needed <= size)
+		return 0;
+
+	while (size < needed)
+		size += CHUNKSIZE;
+
+	tmp = realloc(*buf, size);
+	if (tmp == NULL)
+		return -1;
+
+	*buf = tmp;
+	*alloc = size;
+
+	return 0;
+}
+
+/*
+ * Transcribe the NUL-terminated UTF-8 string 'devanagari' to Czech.
+ *
+ * Code points missing from the table are copied to the output unchanged.
+ * Return a newly allocated string the caller has to free(), or NULL when
+ * 'devanagari' is NULL or the memory allocation fails.
+ */
+char *transcript_devanagari_to_czech(const char *devanagari)
+{
+	struct translit_letter *table, *letter;
+	unsigned int c, prev = 0, alloc = 0, done = 0, len, size;
+	const char *src = devanagari;
+	char *latin = NULL;
+
+	if (devanagari == NULL)
+		return NULL;
+
+	table = get_iast_czech_transliteration_table();
+
+	/* the filters and the terminator need a buffer even for "" */
+	if (reserve(&latin, &alloc, 1) != 0)
+		return NULL;
+
+	while (1) {
+		c = utf8_unpack_char(src);
+
+		/* both filters patch the output written for 'prev' */
+		nasal_consonants_filter(latin, &done, prev, c);
+		end_of_word_filter(latin, &done, prev, c);
+
+		if (c == 0)
+			break;
+
+		len = utf8_char_length(c);
+		src += len;
+
+		letter = letter_by_code(table, c);
+		size = letter ? strlen(letter->data) : UNICODE_MAX_LENGTH;
+
+		/* the data plus an inherent schwa and the terminator */
+		if (reserve(&latin, &alloc, done + size + 2) != 0) {
+			free(latin);
+			return NULL;
+		}
+
+		if (letter) {
+			/* a vowel sign replaces the inherent schwa */
+			if (letter->type == VOWEL_SIGN && is_consonant(prev) &&
+			    done && latin[done - 1] == SCHWA_CHARACTER)
+				done--;
+
+			memcpy(latin + done, letter->data, size);
+			done += size;
+
+			if (letter->type == CONSONANT)
+				latin[done++] = SCHWA_CHARACTER;
+		} else {
+			utf8_pack_char(latin + done, c);
+			done += len;
+		}
+
+		prev = c;
+	}
+
+	latin[done] = '\0';
+
+	return latin;
+}
diff --git a/transcription.h b/transcription.h
new file mode 100644
index 0000000..7b40dd8
--- /dev/null
+++ b/transcription.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __TRANSCRIPTION_H
+#define __TRANSCRIPTION_H
+
+char *transcript_devanagari_to_czech(const char *devanagari);
+
+#endif /* __TRANSCRIPTION_H */