implement transcription of Hindi

2021-12-21 21:55:16 +01:00 · 2021-12-21 21:55:16 +01:00 · b881497064
commit b881497064
parent 5685e83933
16 changed files with 308 additions and 65 deletions
--- a/Makefile.in
+++ b/Makefile.in
@ -24,10 +24,10 @@ LFLAGS = @COVERAGE_LFLAGS@
 TEST_CFLAGS = @CFLAGS@ @CHECK_CFLAGS@
 TEST_LFLAGS = @CHECK_LIBS@ @COVERAGE_LFLAGS@
-OBJECTS = transliteration.o transcription.o utf8.o velthuis.o
+OBJECTS = transliteration.o czech.o hindi.o utf8.o velthuis.o
-TEST_OBJECTS = tests/main.o tests/translit.o tests/transcript.o 	\
+TEST_OBJECTS = tests/main.o tests/translit.o tests/czech.o		\
-	tests/velthuis.o tests/utf8.o tests/integration.o
+	tests/hindi.o tests/velthuis.o tests/utf8.o tests/integration.o
 AUX_FILES = Makefile configure aclocal.m4 install-sh config.h* *.log	\
 	*.status *.cache
--- a/compat.h
+++ b/compat.h
@ -12,8 +12,4 @@
 #define ARRAY_SIZE(a) sizeof(a) / sizeof(*a)
 enum err {
 	EHINDI = 1
 };
 #endif /* __COMPAT_H */
--- a/transcription.c
+++ b/transcription.c
@ -2,7 +2,7 @@
 /* https://cs.wikipedia.org/wiki/Wikipedie:Transkripce_hind%C5%A1tiny */
 #include "compat.h"
-#include "transcription.h"
+#include "czech.h"
 #include "transliteration.h"
 #include "utf8.h"
@ -193,11 +193,6 @@ int transcript_devanagari_to_czech(const char *devanagari, char **ret)
 		if (c == ZERO_WIDTH_JOINER)
 			continue;
 		if (c == NUKTA) {
 			*ret = NULL;
 			return EHINDI;
 		}
 		nasal_consonants_filter(latin, &done, prev, c);
 		letter = letter_by_code(c);
--- a/transcription.h
+++ b/transcription.h
@ -1,8 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __TRANSCRIPTION_H
+#ifndef __CZECH_H
-#define __TRANSCRIPTION_H
+#define __CZECH_H
 int transcript_devanagari_to_czech(const char *devanagari, char **ret);
-#endif /* __TRANSCRIPTION_H */
+#endif /* __CZECH_H */
--- a/hindi.c
+++ b/hindi.c
@ -0,0 +1,216 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* https://cs.wikipedia.org/wiki/Wikipedie:Transkripce_hind%C5%A1tiny */
 #include "compat.h"
 #include "hindi.h"
 #include "transliteration.h"
 #include "utf8.h"
 #define SCHWA_CHARACTER   'a'
 #define ZERO_WIDTH_JOINER 0x200d
 #define NUKTA             0x093c
 #define CHUNKSIZE         1024
 /*
  * Lookup table for the Hindi transcription.  Each row pairs a character
  * code (as returned by utf8_unpack_char) with its class and the Latin text
  * that transcript_devanagari_to_hindi() emits for it.  Rows are found by an
  * exact match on the code in letter_by_code(), so their order does not
  * influence the result.
  */
 static struct translit_letter table[] = {
 	/* Special characters */
 	{0x0950, SPECIAL, "aum"},        /* aum */
 	/* Vowels */
 	{0x0910, VOWEL, "ai"},           /* 01 */
 	{0x0914, VOWEL, "au"},           /* 02 */
 	{0x0905, VOWEL, "a"},            /* 03 */
 	{0x0906, VOWEL, "aa"},           /* 04 */
 	{0x0907, VOWEL, "i"},            /* 05 */
 	{0x0908, VOWEL, "ee"},           /* 06 */
 	{0x0909, VOWEL, "u"},            /* 07 */
 	{0x090a, VOWEL, "oo"},           /* 08 */
 	{0x090b, VOWEL, "r"},            /* 09 */
 	{0x0960, VOWEL, "rr"},           /* 10 */
 	{0x090c, VOWEL, "l"},            /* 11 */
 	{0x0961, VOWEL, "ll"},           /* 12 */
 	{0x090f, VOWEL, "e"},            /* 13 */
 	{0x0913, VOWEL, "o"},            /* 14 */
 	{0x0911, VOWEL, "o"},            /* candra o */
 	{0x0912, VOWEL, "o"},            /* short o */
 	/* Consonants (the caller appends the inherent schwa after each one) */
 	{0x0916, CONSONANT, "kh"},       /* 01 */
 	{0x0918, CONSONANT, "gh"},       /* 02 */
 	{0x091b, CONSONANT, "chh"},      /* 03 */
 	{0x091d, CONSONANT, "jh"},       /* 04 */
 	{0x0920, CONSONANT, "th"},       /* 05 */
 	{0x0922, CONSONANT, "dh"},       /* 06 */
 	{0x0925, CONSONANT, "th"},       /* 07 */
 	{0x0927, CONSONANT, "dh"},       /* 08 */
 	{0x092b, CONSONANT, "ph"},       /* 09 */
 	{0x092d, CONSONANT, "bh"},       /* 10 */
 	{0x0915, CONSONANT, "k"},        /* 11 */
 	{0x0917, CONSONANT, "g"},        /* 12 */
 	{0x0919, CONSONANT, "n"},        /* 13 */
 	{0x0939, CONSONANT, "h"},        /* 14 */
 	{0x091a, CONSONANT, "ch"},       /* 15 */
 	{0x091c, CONSONANT, "j"},        /* 16 */
 	{0x091e, CONSONANT, "n"},        /* 17 */
 	{0x092f, CONSONANT, "y"},        /* 18 */
 	{0x0936, CONSONANT, "sh"},       /* 19 */
 	{0x091f, CONSONANT, "t"},        /* 20 */
 	{0x0921, CONSONANT, "d"},        /* 21 */
 	{0x0923, CONSONANT, "n"},        /* 22 */
 	{0x0930, CONSONANT, "r"},        /* 23 */
 	{0x0937, CONSONANT, "sh"},       /* 24 */
 	{0x0924, CONSONANT, "t"},        /* 25 */
 	{0x0926, CONSONANT, "d"},        /* 26 */
 	{0x0928, CONSONANT, "n"},        /* 27 */
 	{0x0932, CONSONANT, "l"},        /* 28 */
 	{0x0938, CONSONANT, "s"},        /* 29 */
 	{0x092a, CONSONANT, "p"},        /* 30 */
 	{0x092c, CONSONANT, "b"},        /* 31 */
 	{0x092e, CONSONANT, "m"},        /* 32 */
 	{0x0935, CONSONANT, "v"},        /* 33 */
 	{0x0933, CONSONANT, "l"},        /* (.l) */
 	/* Additional consonants - independent versions */
 	{0x0958, CONSONANT, "k"},
 	{0x0959, CONSONANT, "kh"},
 	{0x095a, CONSONANT, "g"},
 	{0x095b, CONSONANT, "z"},
 	{0x095c, CONSONANT, "d"},
 	{0x095d, CONSONANT, "dh"},
 	{0x095e, CONSONANT, "f"},
 	/* Codas */
 	{0x0902, CODA, "n"},             /* anusvara */
 	{0x0903, CODA, "h"},             /* visarga */
 	{0x093d, CODA, "'"},             /* avagraha (') */
 	{0x0901, CODA, "n"},             /* candrabindu */
 	/* Numbers */
 	{0x0966, NUMBER, "0"},
 	{0x0967, NUMBER, "1"},
 	{0x0968, NUMBER, "2"},
 	{0x0969, NUMBER, "3"},
 	{0x096a, NUMBER, "4"},
 	{0x096b, NUMBER, "5"},
 	{0x096c, NUMBER, "6"},
 	{0x096d, NUMBER, "7"},
 	{0x096e, NUMBER, "8"},
 	{0x096f, NUMBER, "9"},
 	/* Diacritic modifiers (each one replaces the preceding inherent schwa) */
 	{0x0948, VOWEL_SIGN, "ai"},
 	{0x094c, VOWEL_SIGN, "au"},
 	{0x093e, VOWEL_SIGN, "aa"},
 	{0x093f, VOWEL_SIGN, "i"},
 	{0x0940, VOWEL_SIGN, "ee"},
 	{0x0941, VOWEL_SIGN, "u"},
 	{0x0942, VOWEL_SIGN, "oo"},
 	{0x0943, VOWEL_SIGN, "r"},
 	{0x0944, VOWEL_SIGN, "rr"},
 	{0x0962, VOWEL_SIGN, "l"},
 	{0x0963, VOWEL_SIGN, "ll"},
 	{0x0947, VOWEL_SIGN, "e"},
 	{0x094b, VOWEL_SIGN, "o"},
 	{0x0949, VOWEL_SIGN, "o"},       /* candra o */
 	{0x094a, VOWEL_SIGN, "o"},       /* short o */
 	/* empty text: the virama only removes the schwa and emits nothing */
 	{0x094d, VOWEL_SIGN, ""},        /* virama */
 	{0x0965, CODA, "||"},            /* double danda */
 	{0x0964, CODA, "|"},             /* danda */
 };
 static struct translit_letter *letter_by_code(unsigned int c)
 {
 	unsigned int i;
 	for (i = 0; i < ARRAY_SIZE(table); i++) {
 		if (table[i].code == c) {
 			return table + i;
 		}
 	}
 	return NULL;
 }
 /*
  * Rewrite the Latin text of the consonant that precedes a nukta sign.
  *
  * latin: output buffer; the text written for the previous consonant
  *        (its letters plus the inherent schwa) ends right before *pos
  * pos:   current write offset into latin; moved back by one when the
  *        replacement text is shorter than what it replaces
  * prev:  code of the character that came before the nukta
  *
  * Only the three codes handled below change anything; every other value
  * of prev leaves both the buffer and *pos untouched.
  */
 static void nukta_filter(char *latin, unsigned int *pos, unsigned int prev)
 {
 	unsigned int end = *pos;
 	if (prev == 0x091c) {
 		/* "ja" becomes "za" */
 		latin[end - 2] = 'z';
 		return;
 	}
 	if (prev == 0x091d) {
 		/* "jha" becomes "zha" */
 		latin[end - 3] = 'z';
 		return;
 	}
 	if (prev == 0x092b) {
 		/* "pha" becomes "fa", which is one byte shorter */
 		strcpy(latin + end - 3, "fa");
 		*pos = end - 1;
 	}
 }
 /*
  * Transcribe a Devanagari text into Latin letters using the Hindi table.
  *
  * devanagari: NUL-terminated input, decoded one character at a time with
  *             utf8_unpack_char(); characters without a table row are
  *             copied to the output unchanged
  * ret:        receives the heap-allocated, NUL-terminated result; the
  *             caller releases it with free()
  *
  * Returns 0 on success.  When the output buffer cannot be grown, the
  * partial result is freed, *ret is set to NULL and -1 is returned.
  */
 int transcript_devanagari_to_hindi(const char *devanagari, char **ret)
 {
 	struct translit_letter *letter;
 	unsigned int c, prev = 0, alloc = 0, done = 0, len;
 	const char *src = devanagari;
 	char *latin = NULL, *grown;
 	while (1) {
 		/* grow the buffer in chunks before writing the next piece */
 		if (alloc < done + UNICODE_MAX_LENGTH) {
 			grown = realloc(latin, alloc + CHUNKSIZE);
 			if (grown == NULL) {
 				/* the old block is still valid after a failed realloc */
 				free(latin);
 				*ret = NULL;
 				return -1;
 			}
 			latin = grown;
 			alloc += CHUNKSIZE;
 		}
 		c = utf8_unpack_char(src);
 		len = utf8_char_length(c);
 		src += len;
 		/* joiners produce no output and do not count as previous char */
 		if (c == ZERO_WIDTH_JOINER)
 			continue;
 		letter = letter_by_code(c);
 		if (letter) {
 			switch (letter->type) {
 			case CONSONANT:
 				/* every consonant carries an inherent schwa */
 				strcpy(latin + done, letter->data);
 				done += strlen(letter->data);
 				*(latin + done++) = SCHWA_CHARACTER;
 				break;
 			case VOWEL_SIGN:
 				if (done) {
 					/* delete the inherent schwa */
 					done--;
 				}
 				/* fallthrough */
 			default:
 				strcpy(latin + done, letter->data);
 				done += strlen(letter->data);
 				break;
 			}
 		} else {
 			if (done && c == NUKTA) {
 				/* the nukta only modifies what was already written */
 				nukta_filter(latin, &done, prev);
 				goto next;
 			}
 			/* remove the final schwa */
 			if (is_devanagari(prev) && !is_devanagari(c)) {
 				/*
 				 * done can still be 0 here (e.g. a leading virama
 				 * followed by a space), so check it before looking
 				 * at the last written byte
 				 */
 				if (done && latin[done - 1] == SCHWA_CHARACTER) {
 					latin[--done] = '\0';
 				}
 			}
 			utf8_pack_char(latin + done, c);
 			done += len;
 		}
 next:
 		/* the terminating NUL is processed like any other character */
 		if (c == 0)
 			break;
 		prev = c;
 	}
 	/* NOTE(review): assumes the packed NUL advanced done by one - confirm */
 	*(latin + done - 1) = '\0';
 	*ret = latin;
 	return 0;
 }
--- a/hindi.h
+++ b/hindi.h
@ -0,0 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __HINDI_H
 #define __HINDI_H
 /*
  * Transcribe Hindi text written in Devanagari into Latin letters.
  *
  * devanagari: NUL-terminated UTF-8 input
  * ret:        receives a heap-allocated, NUL-terminated string holding the
  *             transcription; the caller is expected to free() it
  *
  * Returns 0 on success.
  */
 int transcript_devanagari_to_hindi(const char *devanagari, char **ret);
 #endif /* __HINDI_H */
--- a/main.c
+++ b/main.c
@ -1,14 +1,16 @@
 #include "compat.h"
 #include "transliteration.h"
-#include "transcription.h"
+#include "czech.h"
 #include "hindi.h"
 #include "velthuis.h"
 #include "config.h"
 #define FLAG_REVERSE	1 << 0
 #define FLAG_VELTHUIS	1 << 1
 #define FLAG_CZECH	1 << 2
-#define FLAG_ASCII	1 << 3
+#define FLAG_HINDI	1 << 3
-#define FLAG_DEVANAGARI	1 << 4
+#define FLAG_ASCII	1 << 4
 #define FLAG_DEVANAGARI	1 << 5
 static const char *usage_str =
 	PROGNAME ", a helper for Sanskrit transliteration.\n"
@ -23,6 +25,7 @@ static const char *usage_str =
 	"  -a, --ascii       convert a Devanagari text to Velthuis text rather than to IAST\n"
 	"  -d, --devanagari  when encoding, output a Devanagari text rather than IAST\n"
 	"  -c, --czech       transcript Devanagari to Czech language (experimental)\n"
 	"  -H, --hindi       transcript Hindi from Devanagari to Latin (experimental)\n"
 	"  -h, --help        show this help and exit\n"
 	"  -v, --version     show version number and exit\n"
 	"\n"
@ -38,7 +41,7 @@ static const char *usage_str =
 	"\n"
 	"  For more information see the iast(1) manual page.\n";
-static const char *short_opts = "f:readchv";
+static const char *short_opts = "f:readcHhv";
 static const struct option long_opts[] = {
 	{"file",       required_argument,  0, 'f'},
@ -48,6 +51,7 @@ static const struct option long_opts[] = {
 	{"ascii",      no_argument,        0, 'a'},
 	{"devanagari", no_argument,        0, 'd'},
 	{"czech",      no_argument,        0, 'c'},
 	{"hindi",      no_argument,        0, 'H'},
 	{"help",       no_argument,        0, 'h'},
 	{"version",    no_argument,        0, 'v'},
 	{0, 0, 0, 0}
@ -104,6 +108,9 @@ static int process_input(const char *input, char **out, unsigned int flags)
 	if (flags & FLAG_CZECH)
 		return transcript_devanagari_to_czech(input, out);
 	if (flags & FLAG_HINDI)
 		return transcript_devanagari_to_hindi(input, out);
 	if (flags & FLAG_ASCII) {
 		ret = transliterate_devanagari_to_latin(input, &tmp);
 		if (ret != 0)
@ -128,9 +135,6 @@ static int process_string(const char *input, unsigned int flags)
 	case 0:
 		fprintf(stdout, "%s", output);
 		break;
 	case EHINDI:
 		error("the input text is Hindi.");
 		break;
 	default:
 		error("unexpected error.");
 		break;
@ -225,6 +229,9 @@ int main(int argc, const char **argv)
 		case 'c':
 			flags |= FLAG_CZECH;
 			break;
 		case 'H':
 			flags |= FLAG_HINDI;
 			break;
 		case 'h':
 			print_usage();
 			return 0;
--- a/tests/transcript.c
+++ b/tests/transcript.c
@ -1,7 +1,7 @@
 #include "test.h"
-#include "transcript.h"
+#include "czech.h"
 #include "../compat.h"
-#include "../transcription.h"
+#include "../czech.h"
 static void test_transcript(const char *devanagari, const char *latin)
 {
@ -34,18 +34,7 @@ START_TEST(test_transcript_devanagari_to_czech)
 }
 END_TEST
-START_TEST(test_transcript_detect_hindi)
+void register_transcript_czech_tests(TCase *test_case)
 {
 	char *czech;
 	int ret;
 	ret = transcript_devanagari_to_czech("लड़की", &czech);
 	ck_assert_int_eq(EHINDI, ret);
 }
 END_TEST
 void register_transcript_tests(TCase *test_case)
 {
 	tcase_add_test(test_case, test_transcript_devanagari_to_czech);
 	tcase_add_test(test_case, test_transcript_detect_hindi);
 }
--- a/tests/czech.h
+++ b/tests/czech.h
@ -0,0 +1,8 @@
 #ifndef __TEST_CZECH_H
 #define __TEST_CZECH_H
 #include <check.h>
 void register_transcript_czech_tests(TCase *test_case);
 #endif /* __TEST_CZECH_H */
--- a/tests/hindi.c
+++ b/tests/hindi.c
@ -0,0 +1,30 @@
 #include "test.h"
 #include "hindi.h"
 #include "../compat.h"
 #include "../hindi.h"
 /*
  * Run the Hindi transcription on one input and verify that it succeeds and
  * yields exactly the expected Latin text; the result is freed afterwards.
  */
 static void test_transcript(const char *devanagari, const char *latin)
 {
 	char *output;
 	int status = transcript_devanagari_to_hindi(devanagari, &output);
 	ck_assert_int_eq(0, status);
 	ck_assert_str_eq(latin, output);
 	free(output);
 }
 START_TEST(test_transcript_devanagari_to_hindi)
 {
 	test_transcript("क़ ख़ ग़ ज़ झ़ ड़ ढ़ फ़", "k kh g z zh d dh f"); /* composite */
 	test_transcript("क़ ख़ ग़ ज़ ड़ ढ़ फ़", "k kh g z d dh f"); /* independent */
 	test_transcript("कभी ख़ुशी कभी ग़म", "kabhee khushee kabhee gam");
 	test_transcript("मैं एक लड़का हूँ और तुम एक लड़की हो", "main ek ladaka hoon aur tum ek ladakee ho");
 	test_transcript("स्कॉट्लैण्ड ऑरेंज", "skotlaind orenj"); /* o */
 }
 END_TEST
 /* Add the Hindi transcription unit test to the given Check test case. */
 void register_transcript_hindi_tests(TCase *test_case)
 {
 	tcase_add_test(test_case, test_transcript_devanagari_to_hindi);
 }
--- a/tests/hindi.h
+++ b/tests/hindi.h
@ -0,0 +1,8 @@
 #ifndef __TEST_HINDI_H
 #define __TEST_HINDI_H
 #include <check.h>
 void register_transcript_hindi_tests(TCase *test_case);
 #endif /* __TEST_HINDI_H */
--- a/tests/integration.c
+++ b/tests/integration.c
@ -88,13 +88,20 @@ START_TEST(test_transliterate_arguments)
 }
 END_TEST
-START_TEST(test_transcript)
+START_TEST(test_transcript_czech)
 {
 	test_output("./iast -c \"भगवद्गीता\"", "bhagavadgíta\n");
 	test_output("./iast --czech \"तन्त्रशास्त्रम्\"", "tantrašástra\n");
 }
 END_TEST
 START_TEST(test_transcript_hindi)
 {
 	test_output("./iast -H \"हिन्दी\"", "hindee\n");
 	test_output("./iast --hindi \"लड़की\"", "ladakee\n");
 }
 END_TEST
 START_TEST(test_velthuis)
 {
 	test_output("./iast \".rta.m ca satyam\" -e", "ṛtaṃ ca satyam\n");
@ -141,7 +148,6 @@ START_TEST(test_errors)
 {
 	test_output("./iast -x 2>&1", "[iast] error: unrecognised option '-x'.\n");
 	test_output("./iast -f xxx 2>&1", "[iast] error: failed to read file 'xxx'.\n");
 	test_output("./iast \u0921\u093c 2>&1", "[iast] error: the input text is Hindi.\n");
 }
 END_TEST
@ -149,7 +155,8 @@ void register_integration_tests(TCase *test_case)
 {
 	tcase_add_test(test_case, test_transliterate_files);
 	tcase_add_test(test_case, test_transliterate_arguments);
-	tcase_add_test(test_case, test_transcript);
+	tcase_add_test(test_case, test_transcript_czech);
 	tcase_add_test(test_case, test_transcript_hindi);
 	tcase_add_test(test_case, test_velthuis);
 	tcase_add_test(test_case, test_ascii);
 	tcase_add_test(test_case, test_version);
--- a/tests/main.c
+++ b/tests/main.c
@ -1,6 +1,7 @@
 #include "test.h"
 #include "translit.h"
-#include "transcript.h"
+#include "czech.h"
 #include "hindi.h"
 #include "velthuis.h"
 #include "utf8.h"
 #include "integration.h"
@ -14,7 +15,8 @@ static Suite *create_test_suite()
 	test_case = tcase_create(NULL);
 	register_translit_tests(test_case);
-	register_transcript_tests(test_case);
+	register_transcript_czech_tests(test_case);
 	register_transcript_hindi_tests(test_case);
 	register_velthuis_encoder_tests(test_case);
 	register_utf8_tests(test_case);
--- a/tests/transcript.h
+++ b/tests/transcript.h
@ -1,8 +0,0 @@
 #ifndef __TEST_TRANSCRIPT_H
 #define __TEST_TRANSCRIPT_H
 #include <check.h>
 void register_transcript_tests(TCase *test_case);
 #endif /* __TEST_TRANSCRIPT_H */
--- a/tests/translit.c
+++ b/tests/translit.c
@ -84,14 +84,6 @@ START_TEST(test_translit_zero_width_joiner)
 }
 END_TEST
 START_TEST(test_translit_detect_hindi)
 {
 	char *hindi = NULL;
 	int ret = transliterate_devanagari_to_latin("लड़की", &hindi);
 	ck_assert_int_eq(EHINDI, ret);
 }
 END_TEST
 void register_translit_tests(TCase *test_case)
 {
 	tcase_add_test(test_case, test_translit_devanagari_to_latin);
@ -99,5 +91,4 @@ void register_translit_tests(TCase *test_case)
 	tcase_add_test(test_case, test_translit_lla_sylable);
 	tcase_add_test(test_case, test_translit_candrabindu);
 	tcase_add_test(test_case, test_translit_zero_width_joiner);
 	tcase_add_test(test_case, test_translit_detect_hindi);
 }
--- a/transliteration.c
+++ b/transliteration.c
@ -8,7 +8,6 @@
 #define SCHWA_CHARACTER   'a'
 #define ZERO_WIDTH_JOINER 0x200d
 #define VIRAMA            0x094d
 #define NUKTA             0x093c
 #define CHUNKSIZE         1024
 static struct translit_letter table[] = {
@ -136,11 +135,6 @@ int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
 		len = utf8_char_length(c);
 		src += len;
 		if (c == NUKTA) {
 			*ret = NULL;
 			return EHINDI;
 		}
 		letter = letter_by_code(c);
 		if (letter) {
 			switch (letter->type) {