diff --git a/Makefile.in b/Makefile.in index d9eb42d..8ec6d05 100644 --- a/Makefile.in +++ b/Makefile.in @@ -24,10 +24,10 @@ LFLAGS = @COVERAGE_LFLAGS@ TEST_CFLAGS = @CFLAGS@ @CHECK_CFLAGS@ TEST_LFLAGS = @CHECK_LIBS@ @COVERAGE_LFLAGS@ -OBJECTS = transliteration.o transcription.o utf8.o velthuis.o +OBJECTS = transliteration.o czech.o hindi.o utf8.o velthuis.o -TEST_OBJECTS = tests/main.o tests/translit.o tests/transcript.o \ - tests/velthuis.o tests/utf8.o tests/integration.o +TEST_OBJECTS = tests/main.o tests/translit.o tests/czech.o \ + tests/hindi.o tests/velthuis.o tests/utf8.o tests/integration.o AUX_FILES = Makefile configure aclocal.m4 install-sh config.h* *.log \ *.status *.cache diff --git a/compat.h b/compat.h index 20bfc27..8d2fb8d 100644 --- a/compat.h +++ b/compat.h @@ -12,8 +12,4 @@ #define ARRAY_SIZE(a) sizeof(a) / sizeof(*a) -enum err { - EHINDI = 1 -}; - #endif /* __COMPAT_H */ diff --git a/transcription.c b/czech.c similarity index 98% rename from transcription.c rename to czech.c index 68049f2..7a4a206 100644 --- a/transcription.c +++ b/czech.c @@ -2,7 +2,7 @@ /* https://cs.wikipedia.org/wiki/Wikipedie:Transkripce_hind%C5%A1tiny */ #include "compat.h" -#include "transcription.h" +#include "czech.h" #include "transliteration.h" #include "utf8.h" @@ -193,11 +193,6 @@ int transcript_devanagari_to_czech(const char *devanagari, char **ret) if (c == ZERO_WIDTH_JOINER) continue; - if (c == NUKTA) { - *ret = NULL; - return EHINDI; - } - nasal_consonants_filter(latin, &done, prev, c); letter = letter_by_code(c); diff --git a/transcription.h b/czech.h similarity index 57% rename from transcription.h rename to czech.h index 11ccbdb..a972b77 100644 --- a/transcription.h +++ b/czech.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __TRANSCRIPTION_H -#define __TRANSCRIPTION_H +#ifndef __CZECH_H +#define __CZECH_H int transcript_devanagari_to_czech(const char *devanagari, char **ret); -#endif /* __TRANSCRIPTION_H */ +#endif /* __CZECH_H */ diff 
--git a/hindi.c b/hindi.c
new file mode 100644
index 0000000..1e61627
--- /dev/null
+++ b/hindi.c
@@ -0,0 +1,245 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* https://cs.wikipedia.org/wiki/Wikipedie:Transkripce_hind%C5%A1tiny */
+
+#include "compat.h"
+#include "hindi.h"
+#include "transliteration.h"
+#include "utf8.h"
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define SCHWA_CHARACTER 'a'
+#define ZERO_WIDTH_JOINER 0x200d
+#define NUKTA 0x093c
+#define CHUNKSIZE 1024
+/* longest table entry (3 bytes) plus the schwa or the NUL strcpy() adds */
+#define LATIN_MAX_LENGTH 4
+
+/*
+ * Latin transcription of the Devanagari code points handled for Hindi.
+ * Consonants are listed without the inherent schwa; it is appended by
+ * transcript_devanagari_to_hindi().
+ */
+static const struct translit_letter table[] = {
+
+	/* Special characters */
+	{0x0950, SPECIAL, "aum"},	/* aum */
+
+	/* Vowels */
+	{0x0910, VOWEL, "ai"},	/* 01 */
+	{0x0914, VOWEL, "au"},	/* 02 */
+	{0x0905, VOWEL, "a"},	/* 03 */
+	{0x0906, VOWEL, "aa"},	/* 04 */
+	{0x0907, VOWEL, "i"},	/* 05 */
+	{0x0908, VOWEL, "ee"},	/* 06 */
+	{0x0909, VOWEL, "u"},	/* 07 */
+	{0x090a, VOWEL, "oo"},	/* 08 */
+	{0x090b, VOWEL, "r"},	/* 09 */
+	{0x0960, VOWEL, "rr"},	/* 10 */
+	{0x090c, VOWEL, "l"},	/* 11 */
+	{0x0961, VOWEL, "ll"},	/* 12 */
+	{0x090f, VOWEL, "e"},	/* 13 */
+	{0x0913, VOWEL, "o"},	/* 14 */
+	{0x0911, VOWEL, "o"},	/* candra o */
+	{0x0912, VOWEL, "o"},	/* short o */
+
+	/* Consonants */
+	{0x0916, CONSONANT, "kh"},	/* 01 */
+	{0x0918, CONSONANT, "gh"},	/* 02 */
+	{0x091b, CONSONANT, "chh"},	/* 03 */
+	{0x091d, CONSONANT, "jh"},	/* 04 */
+	{0x0920, CONSONANT, "th"},	/* 05 */
+	{0x0922, CONSONANT, "dh"},	/* 06 */
+	{0x0925, CONSONANT, "th"},	/* 07 */
+	{0x0927, CONSONANT, "dh"},	/* 08 */
+	{0x092b, CONSONANT, "ph"},	/* 09 */
+	{0x092d, CONSONANT, "bh"},	/* 10 */
+	{0x0915, CONSONANT, "k"},	/* 11 */
+	{0x0917, CONSONANT, "g"},	/* 12 */
+	{0x0919, CONSONANT, "n"},	/* 13 */
+	{0x0939, CONSONANT, "h"},	/* 14 */
+	{0x091a, CONSONANT, "ch"},	/* 15 */
+	{0x091c, CONSONANT, "j"},	/* 16 */
+	{0x091e, CONSONANT, "n"},	/* 17 */
+	{0x092f, CONSONANT, "y"},	/* 18 */
+	{0x0936, CONSONANT, "sh"},	/* 19 */
+	{0x091f, CONSONANT, "t"},	/* 20 */
+	{0x0921, CONSONANT, "d"},	/* 21 */
+	{0x0923, CONSONANT, "n"},	/* 22 */
+	{0x0930, CONSONANT, "r"},	/* 23 */
+	{0x0937, CONSONANT, "sh"},	/* 24 */
+	{0x0924, CONSONANT, "t"},	/* 25 */
+	{0x0926, CONSONANT, "d"},	/* 26 */
+	{0x0928, CONSONANT, "n"},	/* 27 */
+	{0x0932, CONSONANT, "l"},	/* 28 */
+	{0x0938, CONSONANT, "s"},	/* 29 */
+	{0x092a, CONSONANT, "p"},	/* 30 */
+	{0x092c, CONSONANT, "b"},	/* 31 */
+	{0x092e, CONSONANT, "m"},	/* 32 */
+	{0x0935, CONSONANT, "v"},	/* 33 */
+	{0x0933, CONSONANT, "l"},	/* (.l) */
+
+	/* Additional consonants - independent versions */
+	{0x0958, CONSONANT, "k"},
+	{0x0959, CONSONANT, "kh"},
+	{0x095a, CONSONANT, "g"},
+	{0x095b, CONSONANT, "z"},
+	{0x095c, CONSONANT, "d"},
+	{0x095d, CONSONANT, "dh"},
+	{0x095e, CONSONANT, "f"},
+
+	/* Codas */
+	{0x0902, CODA, "n"},	/* anusvara */
+	{0x0903, CODA, "h"},	/* visarga */
+	{0x093d, CODA, "'"},	/* avagraha (') */
+	{0x0901, CODA, "n"},	/* candrabindu */
+
+	/* Numbers */
+	{0x0966, NUMBER, "0"},
+	{0x0967, NUMBER, "1"},
+	{0x0968, NUMBER, "2"},
+	{0x0969, NUMBER, "3"},
+	{0x096a, NUMBER, "4"},
+	{0x096b, NUMBER, "5"},
+	{0x096c, NUMBER, "6"},
+	{0x096d, NUMBER, "7"},
+	{0x096e, NUMBER, "8"},
+	{0x096f, NUMBER, "9"},
+
+	/* Diacritic modifiers */
+	{0x0948, VOWEL_SIGN, "ai"},
+	{0x094c, VOWEL_SIGN, "au"},
+	{0x093e, VOWEL_SIGN, "aa"},
+	{0x093f, VOWEL_SIGN, "i"},
+	{0x0940, VOWEL_SIGN, "ee"},
+	{0x0941, VOWEL_SIGN, "u"},
+	{0x0942, VOWEL_SIGN, "oo"},
+	{0x0943, VOWEL_SIGN, "r"},
+	{0x0944, VOWEL_SIGN, "rr"},
+	{0x0962, VOWEL_SIGN, "l"},
+	{0x0963, VOWEL_SIGN, "ll"},
+	{0x0947, VOWEL_SIGN, "e"},
+	{0x094b, VOWEL_SIGN, "o"},
+	{0x0949, VOWEL_SIGN, "o"},	/* candra o */
+	{0x094a, VOWEL_SIGN, "o"},	/* short o */
+	{0x094d, VOWEL_SIGN, ""},	/* virama */
+
+	{0x0965, CODA, "||"},	/* double danda */
+	{0x0964, CODA, "|"},	/* danda */
+};
+
+/* Return the table entry for code point c, or NULL when c is not listed. */
+static const struct translit_letter *letter_by_code(unsigned int c)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(table); i++) {
+		if (table[i].code == c)
+			return table + i;
+	}
+
+	return NULL;
+}
+
+/*
+ * Adjust the syllable just written for consonant prev when a nukta follows
+ * it. latin[0..*pos) must end with that consonant's transcription plus the
+ * inherent schwa. Consonants not listed here keep their plain
+ * transcription.
+ */
+static void nukta_filter(char *latin, unsigned int *pos, unsigned int prev)
+{
+	switch (prev) {
+	case 0x091c:	/* "ja" -> "za" */
+		latin[*pos - 2] = 'z';
+		break;
+	case 0x091d:	/* "jha" -> "zha" */
+		latin[*pos - 3] = 'z';
+		break;
+	case 0x092b:	/* "pha" -> "fa", one byte shorter */
+		strcpy(latin + *pos - 3, "fa");
+		*pos = *pos - 1;
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Transcribe a UTF-8 Devanagari (Hindi) string to Latin.
+ *
+ * On success a newly allocated NUL-terminated string is stored in *ret;
+ * the caller must free() it, and 0 is returned. If memory runs out, *ret is
+ * set to NULL and ENOMEM is returned.
+ */
+int transcript_devanagari_to_hindi(const char *devanagari, char **ret)
+{
+	const struct translit_letter *letter;
+	unsigned int c, prev = 0, alloc = 0, done = 0, len;
+	const char *src = devanagari;
+	char *latin = NULL;
+
+	while (1) {
+		/* room for the next write and, at the end, the terminating NUL */
+		if (alloc < done + UNICODE_MAX_LENGTH + LATIN_MAX_LENGTH) {
+			char *tmp = realloc(latin, alloc + CHUNKSIZE);
+
+			if (tmp == NULL) {
+				free(latin);
+				*ret = NULL;
+				return ENOMEM;
+			}
+			latin = tmp;
+			alloc += CHUNKSIZE;
+		}
+
+		c = utf8_unpack_char(src);
+		len = utf8_char_length(c);
+		src += len;
+
+		if (c == ZERO_WIDTH_JOINER)
+			continue;
+
+		letter = letter_by_code(c);
+		if (letter) {
+			switch (letter->type) {
+			case CONSONANT:
+				strcpy(latin + done, letter->data);
+				done += strlen(letter->data);
+				latin[done++] = SCHWA_CHARACTER;
+				break;
+			case VOWEL_SIGN:
+				/* the sign replaces the inherent schwa, if one is there */
+				if (done && latin[done - 1] == SCHWA_CHARACTER)
+					done--;
+				/* fallthrough */
+			default:
+				strcpy(latin + done, letter->data);
+				done += strlen(letter->data);
+				break;
+			}
+		} else if (done && c == NUKTA) {
+			nukta_filter(latin, &done, prev);
+		} else {
+			/* remove the final schwa */
+			if (done && is_devanagari(prev) && !is_devanagari(c) &&
+			    latin[done - 1] == SCHWA_CHARACTER)
+				done--;
+
+			if (c == 0)
+				break;
+
+			utf8_pack_char(latin + done, c);
+			done += len;
+		}
+
+		prev = c;
+	}
+
+	latin[done] = '\0';
+	*ret = latin;
+
+	return 0;
+}
diff --git a/hindi.h b/hindi.h
new file mode 100644
index 0000000..960f554
--- /dev/null
+++ b/hindi.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __HINDI_H
+#define __HINDI_H
+
+int transcript_devanagari_to_hindi(const char *devanagari, char **ret);
+
+#endif /* __HINDI_H */
diff --git a/main.c b/main.c
index f33781f..79b8cc4 100644
--- a/main.c
+++ b/main.c
@@ -1,14 +1,16 @@
 #include "compat.h"
 #include "transliteration.h"
-#include "transcription.h"
+#include "czech.h" +#include "hindi.h" #include "velthuis.h" #include "config.h" #define FLAG_REVERSE 1 << 0 #define FLAG_VELTHUIS 1 << 1 #define FLAG_CZECH 1 << 2 -#define FLAG_ASCII 1 << 3 -#define FLAG_DEVANAGARI 1 << 4 +#define FLAG_HINDI 1 << 3 +#define FLAG_ASCII 1 << 4 +#define FLAG_DEVANAGARI 1 << 5 static const char *usage_str = PROGNAME ", a helper for Sanskrit transliteration.\n" @@ -23,6 +25,7 @@ static const char *usage_str = " -a, --ascii convert a Devanagari text to Velthuis text rather than to IAST\n" " -d, --devanagari when encoding, output a Devanagari text rather than IAST\n" " -c, --czech transcript Devanagari to Czech language (experimental)\n" + " -H, --hindi transcript Hindi from Devanagari to Latin (experimental)\n" " -h, --help show this help and exit\n" " -v, --version show version number and exit\n" "\n" @@ -38,7 +41,7 @@ static const char *usage_str = "\n" " For more information see the iast(1) manual page.\n"; -static const char *short_opts = "f:readchv"; +static const char *short_opts = "f:readcHhv"; static const struct option long_opts[] = { {"file", required_argument, 0, 'f'}, @@ -48,6 +51,7 @@ static const struct option long_opts[] = { {"ascii", no_argument, 0, 'a'}, {"devanagari", no_argument, 0, 'd'}, {"czech", no_argument, 0, 'c'}, + {"hindi", no_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, {"version", no_argument, 0, 'v'}, {0, 0, 0, 0} @@ -104,6 +108,9 @@ static int process_input(const char *input, char **out, unsigned int flags) if (flags & FLAG_CZECH) return transcript_devanagari_to_czech(input, out); + if (flags & FLAG_HINDI) + return transcript_devanagari_to_hindi(input, out); + if (flags & FLAG_ASCII) { ret = transliterate_devanagari_to_latin(input, &tmp); if (ret != 0) @@ -128,9 +135,6 @@ static int process_string(const char *input, unsigned int flags) case 0: fprintf(stdout, "%s", output); break; - case EHINDI: - error("the input text is Hindi."); - break; default: error("unexpected error."); break; @@ -225,6 +229,9 
@@ int main(int argc, const char **argv) case 'c': flags |= FLAG_CZECH; break; + case 'H': + flags |= FLAG_HINDI; + break; case 'h': print_usage(); return 0; diff --git a/tests/transcript.c b/tests/czech.c similarity index 73% rename from tests/transcript.c rename to tests/czech.c index 93b5b24..186b6e1 100644 --- a/tests/transcript.c +++ b/tests/czech.c @@ -1,7 +1,7 @@ #include "test.h" -#include "transcript.h" +#include "czech.h" #include "../compat.h" -#include "../transcription.h" +#include "../czech.h" static void test_transcript(const char *devanagari, const char *latin) { @@ -34,18 +34,7 @@ START_TEST(test_transcript_devanagari_to_czech) } END_TEST -START_TEST(test_transcript_detect_hindi) -{ - char *czech; - int ret; - - ret = transcript_devanagari_to_czech("लड़की", &czech); - ck_assert_int_eq(EHINDI, ret); -} -END_TEST - -void register_transcript_tests(TCase *test_case) +void register_transcript_czech_tests(TCase *test_case) { tcase_add_test(test_case, test_transcript_devanagari_to_czech); - tcase_add_test(test_case, test_transcript_detect_hindi); } diff --git a/tests/czech.h b/tests/czech.h new file mode 100644 index 0000000..68b022e --- /dev/null +++ b/tests/czech.h @@ -0,0 +1,8 @@ +#ifndef __TEST_CZECH_H +#define __TEST_CZECH_H + +#include <check.h> + +void register_transcript_czech_tests(TCase *test_case); + +#endif /* __TEST_CZECH_H */ diff --git a/tests/hindi.c b/tests/hindi.c new file mode 100644 index 0000000..1e43af8 --- /dev/null +++ b/tests/hindi.c @@ -0,0 +1,30 @@ +#include "test.h" +#include "hindi.h" +#include "../compat.h" +#include "../hindi.h" + +static void test_transcript(const char *devanagari, const char *latin) +{ + char *hindi; + int ret; + + ret = transcript_devanagari_to_hindi(devanagari, &hindi); + ck_assert_int_eq(0, ret); + ck_assert_str_eq(latin, hindi); + free(hindi); +} + +START_TEST(test_transcript_devanagari_to_hindi) +{ + test_transcript("क़ ख़ ग़ ज़ झ़ ड़ ढ़ फ़", "k kh g z zh d dh f"); /* composite */ + test_transcript("क़ ख़ ग़ ज़ 
ड़ ढ़ फ़", "k kh g z d dh f"); /* independent */ + test_transcript("कभी ख़ुशी कभी ग़म", "kabhee khushee kabhee gam"); + test_transcript("मैं एक लड़का हूँ और तुम एक लड़की हो", "main ek ladaka hoon aur tum ek ladakee ho"); + test_transcript("स्कॉट्लैण्ड ऑरेंज", "skotlaind orenj"); /* o */ +} +END_TEST + +void register_transcript_hindi_tests(TCase *test_case) +{ + tcase_add_test(test_case, test_transcript_devanagari_to_hindi); +} diff --git a/tests/hindi.h b/tests/hindi.h new file mode 100644 index 0000000..e30e42b --- /dev/null +++ b/tests/hindi.h @@ -0,0 +1,8 @@ +#ifndef __TEST_HINDI_H +#define __TEST_HINDI_H + +#include <check.h> + +void register_transcript_hindi_tests(TCase *test_case); + +#endif /* __TEST_HINDI_H */ diff --git a/tests/integration.c b/tests/integration.c index 24ef97b..df3bcba 100644 --- a/tests/integration.c +++ b/tests/integration.c @@ -88,13 +88,20 @@ START_TEST(test_transliterate_arguments) } END_TEST -START_TEST(test_transcript) +START_TEST(test_transcript_czech) { test_output("./iast -c \"भगवद्गीता\"", "bhagavadgíta\n"); test_output("./iast --czech \"तन्त्रशास्त्रम्\"", "tantrašástra\n"); } END_TEST +START_TEST(test_transcript_hindi) +{ + test_output("./iast -H \"हिन्दी\"", "hindee\n"); + test_output("./iast --hindi \"लड़की\"", "ladakee\n"); +} +END_TEST + START_TEST(test_velthuis) { test_output("./iast \".rta.m ca satyam\" -e", "ṛtaṃ ca satyam\n"); @@ -141,7 +148,6 @@ START_TEST(test_errors) { test_output("./iast -x 2>&1", "[iast] error: unrecognised option '-x'.\n"); test_output("./iast -f xxx 2>&1", "[iast] error: failed to read file 'xxx'.\n"); - test_output("./iast \u0921\u093c 2>&1", "[iast] error: the input text is Hindi.\n"); } END_TEST @@ -149,7 +155,8 @@ void register_integration_tests(TCase *test_case) { tcase_add_test(test_case, test_transliterate_files); tcase_add_test(test_case, test_transliterate_arguments); - tcase_add_test(test_case, test_transcript); + tcase_add_test(test_case, test_transcript_czech); + tcase_add_test(test_case, 
test_transcript_hindi); tcase_add_test(test_case, test_velthuis); tcase_add_test(test_case, test_ascii); tcase_add_test(test_case, test_version); diff --git a/tests/main.c b/tests/main.c index e025e5c..cd94582 100644 --- a/tests/main.c +++ b/tests/main.c @@ -1,6 +1,7 @@ #include "test.h" #include "translit.h" -#include "transcript.h" +#include "czech.h" +#include "hindi.h" #include "velthuis.h" #include "utf8.h" #include "integration.h" @@ -14,7 +15,8 @@ static Suite *create_test_suite() test_case = tcase_create(NULL); register_translit_tests(test_case); - register_transcript_tests(test_case); + register_transcript_czech_tests(test_case); + register_transcript_hindi_tests(test_case); register_velthuis_encoder_tests(test_case); register_utf8_tests(test_case); diff --git a/tests/transcript.h b/tests/transcript.h deleted file mode 100644 index efc007d..0000000 --- a/tests/transcript.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __TEST_TRANSCRIPT_H -#define __TEST_TRANSCRIPT_H - -#include - -void register_transcript_tests(TCase *test_case); - -#endif /* __TEST_TRANSCRIPT_H */ diff --git a/tests/translit.c b/tests/translit.c index c17a337..3f5643b 100644 --- a/tests/translit.c +++ b/tests/translit.c @@ -84,14 +84,6 @@ START_TEST(test_translit_zero_width_joiner) } END_TEST -START_TEST(test_translit_detect_hindi) -{ - char *hindi = NULL; - int ret = transliterate_devanagari_to_latin("लड़की", &hindi); - ck_assert_int_eq(EHINDI, ret); -} -END_TEST - void register_translit_tests(TCase *test_case) { tcase_add_test(test_case, test_translit_devanagari_to_latin); @@ -99,5 +91,4 @@ void register_translit_tests(TCase *test_case) tcase_add_test(test_case, test_translit_lla_sylable); tcase_add_test(test_case, test_translit_candrabindu); tcase_add_test(test_case, test_translit_zero_width_joiner); - tcase_add_test(test_case, test_translit_detect_hindi); } diff --git a/transliteration.c b/transliteration.c index b1fba7d..2f8b1bc 100644 --- a/transliteration.c +++ b/transliteration.c @@ -8,7 
+8,6 @@ #define SCHWA_CHARACTER 'a' #define ZERO_WIDTH_JOINER 0x200d #define VIRAMA 0x094d -#define NUKTA 0x093c #define CHUNKSIZE 1024 static struct translit_letter table[] = { @@ -136,11 +135,6 @@ int transliterate_devanagari_to_latin(const char *devanagari, char **ret) len = utf8_char_length(c); src += len; - if (c == NUKTA) { - *ret = NULL; - return EHINDI; - } - letter = letter_by_code(c); if (letter) { switch (letter->type) {