implement transcription of hindi

This commit is contained in:
Vlasta Vesely 2021-12-21 21:55:16 +01:00
parent 5685e83933
commit b881497064
No known key found for this signature in database
GPG key ID: EB0E649DC0DFCC22
16 changed files with 308 additions and 65 deletions

View file

@ -24,10 +24,10 @@ LFLAGS = @COVERAGE_LFLAGS@
TEST_CFLAGS = @CFLAGS@ @CHECK_CFLAGS@ TEST_CFLAGS = @CFLAGS@ @CHECK_CFLAGS@
TEST_LFLAGS = @CHECK_LIBS@ @COVERAGE_LFLAGS@ TEST_LFLAGS = @CHECK_LIBS@ @COVERAGE_LFLAGS@
OBJECTS = transliteration.o transcription.o utf8.o velthuis.o OBJECTS = transliteration.o czech.o hindi.o utf8.o velthuis.o
TEST_OBJECTS = tests/main.o tests/translit.o tests/transcript.o \ TEST_OBJECTS = tests/main.o tests/translit.o tests/czech.o \
tests/velthuis.o tests/utf8.o tests/integration.o tests/hindi.o tests/velthuis.o tests/utf8.o tests/integration.o
AUX_FILES = Makefile configure aclocal.m4 install-sh config.h* *.log \ AUX_FILES = Makefile configure aclocal.m4 install-sh config.h* *.log \
*.status *.cache *.status *.cache

View file

@ -12,8 +12,4 @@
#define ARRAY_SIZE(a) sizeof(a) / sizeof(*a) #define ARRAY_SIZE(a) sizeof(a) / sizeof(*a)
enum err {
EHINDI = 1
};
#endif /* __COMPAT_H */ #endif /* __COMPAT_H */

View file

@ -2,7 +2,7 @@
/* https://cs.wikipedia.org/wiki/Wikipedie:Transkripce_hind%C5%A1tiny */ /* https://cs.wikipedia.org/wiki/Wikipedie:Transkripce_hind%C5%A1tiny */
#include "compat.h" #include "compat.h"
#include "transcription.h" #include "czech.h"
#include "transliteration.h" #include "transliteration.h"
#include "utf8.h" #include "utf8.h"
@ -193,11 +193,6 @@ int transcript_devanagari_to_czech(const char *devanagari, char **ret)
if (c == ZERO_WIDTH_JOINER) if (c == ZERO_WIDTH_JOINER)
continue; continue;
if (c == NUKTA) {
*ret = NULL;
return EHINDI;
}
nasal_consonants_filter(latin, &done, prev, c); nasal_consonants_filter(latin, &done, prev, c);
letter = letter_by_code(c); letter = letter_by_code(c);

View file

@ -1,8 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
#ifndef __TRANSCRIPTION_H #ifndef __CZECH_H
#define __TRANSCRIPTION_H #define __CZECH_H
int transcript_devanagari_to_czech(const char *devanagari, char **ret); int transcript_devanagari_to_czech(const char *devanagari, char **ret);
#endif /* __TRANSCRIPTION_H */ #endif /* __CZECH_H */

216
hindi.c Normal file
View file

@ -0,0 +1,216 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* https://cs.wikipedia.org/wiki/Wikipedie:Transkripce_hind%C5%A1tiny */
#include "compat.h"
#include "hindi.h"
#include "transliteration.h"
#include "utf8.h"
#define SCHWA_CHARACTER 'a'
#define ZERO_WIDTH_JOINER 0x200d
#define NUKTA 0x093c
#define CHUNKSIZE 1024
/*
 * Transcription table for Hindi: each entry pairs a Devanagari code point
 * with its letter class and the Latin text emitted for it.  The table is
 * searched linearly by letter_by_code().
 */
static struct translit_letter table[] = {
/* Special characters */
{0x0950, SPECIAL, "aum"}, /* aum */
/* Vowels */
{0x0910, VOWEL, "ai"}, /* 01 */
{0x0914, VOWEL, "au"}, /* 02 */
{0x0905, VOWEL, "a"}, /* 03 */
{0x0906, VOWEL, "aa"}, /* 04 */
{0x0907, VOWEL, "i"}, /* 05 */
{0x0908, VOWEL, "ee"}, /* 06 */
{0x0909, VOWEL, "u"}, /* 07 */
{0x090a, VOWEL, "oo"}, /* 08 */
{0x090b, VOWEL, "r"}, /* 09 */
{0x0960, VOWEL, "rr"}, /* 10 */
{0x090c, VOWEL, "l"}, /* 11 */
{0x0961, VOWEL, "ll"}, /* 12 */
{0x090f, VOWEL, "e"}, /* 13 */
{0x0913, VOWEL, "o"}, /* 14 */
{0x0911, VOWEL, "o"}, /* candra o */
{0x0912, VOWEL, "o"}, /* short o */
/* Consonants */
{0x0916, CONSONANT, "kh"}, /* 01 */
{0x0918, CONSONANT, "gh"}, /* 02 */
{0x091b, CONSONANT, "chh"}, /* 03 */
{0x091d, CONSONANT, "jh"}, /* 04 */
{0x0920, CONSONANT, "th"}, /* 05 */
{0x0922, CONSONANT, "dh"}, /* 06 */
{0x0925, CONSONANT, "th"}, /* 07 */
{0x0927, CONSONANT, "dh"}, /* 08 */
{0x092b, CONSONANT, "ph"}, /* 09 */
{0x092d, CONSONANT, "bh"}, /* 10 */
{0x0915, CONSONANT, "k"}, /* 11 */
{0x0917, CONSONANT, "g"}, /* 12 */
{0x0919, CONSONANT, "n"}, /* 13 */
{0x0939, CONSONANT, "h"}, /* 14 */
{0x091a, CONSONANT, "ch"}, /* 15 */
{0x091c, CONSONANT, "j"}, /* 16 */
{0x091e, CONSONANT, "n"}, /* 17 */
{0x092f, CONSONANT, "y"}, /* 18 */
{0x0936, CONSONANT, "sh"}, /* 19 */
{0x091f, CONSONANT, "t"}, /* 20 */
{0x0921, CONSONANT, "d"}, /* 21 */
{0x0923, CONSONANT, "n"}, /* 22 */
{0x0930, CONSONANT, "r"}, /* 23 */
{0x0937, CONSONANT, "sh"}, /* 24 */
{0x0924, CONSONANT, "t"}, /* 25 */
{0x0926, CONSONANT, "d"}, /* 26 */
{0x0928, CONSONANT, "n"}, /* 27 */
{0x0932, CONSONANT, "l"}, /* 28 */
{0x0938, CONSONANT, "s"}, /* 29 */
{0x092a, CONSONANT, "p"}, /* 30 */
{0x092c, CONSONANT, "b"}, /* 31 */
{0x092e, CONSONANT, "m"}, /* 32 */
{0x0935, CONSONANT, "v"}, /* 33 */
{0x0933, CONSONANT, "l"}, /* (.l) */
/* Additional consonants - independent (precomposed nukta) versions */
{0x0958, CONSONANT, "k"},
{0x0959, CONSONANT, "kh"},
{0x095a, CONSONANT, "g"},
{0x095b, CONSONANT, "z"},
{0x095c, CONSONANT, "d"},
{0x095d, CONSONANT, "dh"},
{0x095e, CONSONANT, "f"},
/* Codas */
{0x0902, CODA, "n"}, /* anusvara */
{0x0903, CODA, "h"}, /* visarga */
{0x093d, CODA, "'"}, /* avagraha (') */
{0x0901, CODA, "n"}, /* candrabindu */
/* Numbers */
{0x0966, NUMBER, "0"},
{0x0967, NUMBER, "1"},
{0x0968, NUMBER, "2"},
{0x0969, NUMBER, "3"},
{0x096a, NUMBER, "4"},
{0x096b, NUMBER, "5"},
{0x096c, NUMBER, "6"},
{0x096d, NUMBER, "7"},
{0x096e, NUMBER, "8"},
{0x096f, NUMBER, "9"},
/* Diacritic modifiers */
{0x0948, VOWEL_SIGN, "ai"},
{0x094c, VOWEL_SIGN, "au"},
{0x093e, VOWEL_SIGN, "aa"},
{0x093f, VOWEL_SIGN, "i"},
{0x0940, VOWEL_SIGN, "ee"},
{0x0941, VOWEL_SIGN, "u"},
{0x0942, VOWEL_SIGN, "oo"},
{0x0943, VOWEL_SIGN, "r"},
{0x0944, VOWEL_SIGN, "rr"},
{0x0962, VOWEL_SIGN, "l"},
{0x0963, VOWEL_SIGN, "ll"},
{0x0947, VOWEL_SIGN, "e"},
{0x094b, VOWEL_SIGN, "o"},
{0x0949, VOWEL_SIGN, "o"}, /* candra o */
{0x094a, VOWEL_SIGN, "o"}, /* short o */
/* Empty output: as a VOWEL_SIGN it only drops the preceding inherent schwa */
{0x094d, VOWEL_SIGN, ""}, /* virama */
/* Punctuation */
{0x0965, CODA, "||"}, /* double danda */
{0x0964, CODA, "|"}, /* danda */
};
static struct translit_letter *letter_by_code(unsigned int c)
{
unsigned int i;
for (i = 0; i < ARRAY_SIZE(table); i++) {
if (table[i].code == c) {
return table + i;
}
}
return NULL;
}
/*
 * Rewrite the Latin output of the consonant that precedes a nukta.
 *
 * @latin: output buffer; the consonant and its inherent schwa end at *pos
 * @pos:   current write offset into @latin, reduced when the output shrinks
 * @prev:  code point of the consonant the nukta is attached to
 *
 * Only the consonants whose sound changes are handled; any other @prev
 * leaves both the buffer and the offset untouched.
 */
static void nukta_filter(char *latin, unsigned int *pos, unsigned int prev)
{
	char *tail = latin + *pos;

	if (prev == 0x091c) {
		/* "ja" becomes "za" */
		tail[-2] = 'z';
	} else if (prev == 0x091d) {
		/* "jha" becomes "zha" */
		tail[-3] = 'z';
	} else if (prev == 0x092b) {
		/* "pha" becomes "fa", which is one byte shorter */
		strcpy(tail - 3, "fa");
		*pos -= 1;
	}
}
/*
 * Transcribe Hindi text written in Devanagari into Latin letters.
 *
 * @devanagari: NUL-terminated UTF-8 input text
 * @ret:        receives a newly allocated, NUL-terminated result that the
 *              caller must free(); set to NULL when the function fails
 *
 * Code points found in the transcription table are replaced by their Latin
 * text; everything else is copied through unchanged.  Consonants carry an
 * inherent schwa that is dropped again when a vowel sign follows or when a
 * Devanagari run ends.
 *
 * Returns 0 on success and -1 when the output buffer cannot be allocated.
 */
int transcript_devanagari_to_hindi(const char *devanagari, char **ret)
{
	struct translit_letter *letter;
	unsigned int c, prev = 0, alloc = 0, done = 0, len;
	const char *src = devanagari;
	char *latin = NULL;
	char *grown;

	while (1) {
		/* keep room for whatever a single iteration may append */
		if (alloc < done + UNICODE_MAX_LENGTH) {
			grown = realloc(latin, alloc + CHUNKSIZE);
			if (!grown) {
				/* realloc left the old buffer intact; release it */
				free(latin);
				*ret = NULL;
				return -1;
			}
			latin = grown;
			alloc += CHUNKSIZE;
		}

		c = utf8_unpack_char(src);
		len = utf8_char_length(c);
		src += len;

		/* joiners produce no output and must not become "prev" */
		if (c == ZERO_WIDTH_JOINER)
			continue;

		letter = letter_by_code(c);
		if (letter) {
			switch (letter->type) {
			case CONSONANT:
				strcpy(latin + done, letter->data);
				done += strlen(letter->data);
				*(latin + done++) = SCHWA_CHARACTER;
				break;
			case VOWEL_SIGN:
				if (done) {
					/* delete the inherent schwa */
					done--;
				}
				/* fallthrough */
			default:
				strcpy(latin + done, letter->data);
				done += strlen(letter->data);
				break;
			}
		} else {
			if (done && c == NUKTA) {
				/* the nukta only alters the consonant before it */
				nukta_filter(latin, &done, prev);
				goto next;
			}

			/* remove the final schwa */
			if (is_devanagari(prev) && !is_devanagari(c)) {
				/*
				 * Nothing may have been written yet (e.g. the
				 * input starts with a virama), so never look in
				 * front of the buffer.
				 */
				if (done && latin[done - 1] == SCHWA_CHARACTER) {
					latin[--done] = '\0';
				}
			}

			/* unknown code points, the terminator included, pass through */
			utf8_pack_char(latin + done, c);
			done += len;
		}
next:
		if (c == 0)
			break;
		prev = c;
	}

	/*
	 * The terminator went through the pass-through path above, so the
	 * last byte counted in "done" is the end of the string
	 * (NOTE(review): relies on utf8_char_length(0) being 1 -- confirm).
	 */
	*(latin + done - 1) = '\0';
	*ret = latin;
	return 0;
}

8
hindi.h Normal file
View file

@ -0,0 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __HINDI_H
#define __HINDI_H
/*
 * Transcribe Hindi text from Devanagari (UTF-8) to Latin letters.
 * On success returns 0 and stores a newly allocated string in *ret,
 * which the caller releases with free().
 */
int transcript_devanagari_to_hindi(const char *devanagari, char **ret);
#endif /* __HINDI_H */

21
main.c
View file

@ -1,14 +1,16 @@
#include "compat.h" #include "compat.h"
#include "transliteration.h" #include "transliteration.h"
#include "transcription.h" #include "czech.h"
#include "hindi.h"
#include "velthuis.h" #include "velthuis.h"
#include "config.h" #include "config.h"
#define FLAG_REVERSE 1 << 0 #define FLAG_REVERSE 1 << 0
#define FLAG_VELTHUIS 1 << 1 #define FLAG_VELTHUIS 1 << 1
#define FLAG_CZECH 1 << 2 #define FLAG_CZECH 1 << 2
#define FLAG_ASCII 1 << 3 #define FLAG_HINDI 1 << 3
#define FLAG_DEVANAGARI 1 << 4 #define FLAG_ASCII 1 << 4
#define FLAG_DEVANAGARI 1 << 5
static const char *usage_str = static const char *usage_str =
PROGNAME ", a helper for Sanskrit transliteration.\n" PROGNAME ", a helper for Sanskrit transliteration.\n"
@ -23,6 +25,7 @@ static const char *usage_str =
" -a, --ascii convert a Devanagari text to Velthuis text rather than to IAST\n" " -a, --ascii convert a Devanagari text to Velthuis text rather than to IAST\n"
" -d, --devanagari when encoding, output a Devanagari text rather than IAST\n" " -d, --devanagari when encoding, output a Devanagari text rather than IAST\n"
" -c, --czech transcript Devanagari to Czech language (experimental)\n" " -c, --czech transcript Devanagari to Czech language (experimental)\n"
" -H, --hindi transcript Hindi from Devanagari to Latin (experimental)\n"
" -h, --help show this help and exit\n" " -h, --help show this help and exit\n"
" -v, --version show version number and exit\n" " -v, --version show version number and exit\n"
"\n" "\n"
@ -38,7 +41,7 @@ static const char *usage_str =
"\n" "\n"
" For more information see the iast(1) manual page.\n"; " For more information see the iast(1) manual page.\n";
static const char *short_opts = "f:readchv"; static const char *short_opts = "f:readcHhv";
static const struct option long_opts[] = { static const struct option long_opts[] = {
{"file", required_argument, 0, 'f'}, {"file", required_argument, 0, 'f'},
@ -48,6 +51,7 @@ static const struct option long_opts[] = {
{"ascii", no_argument, 0, 'a'}, {"ascii", no_argument, 0, 'a'},
{"devanagari", no_argument, 0, 'd'}, {"devanagari", no_argument, 0, 'd'},
{"czech", no_argument, 0, 'c'}, {"czech", no_argument, 0, 'c'},
{"hindi", no_argument, 0, 'H'},
{"help", no_argument, 0, 'h'}, {"help", no_argument, 0, 'h'},
{"version", no_argument, 0, 'v'}, {"version", no_argument, 0, 'v'},
{0, 0, 0, 0} {0, 0, 0, 0}
@ -104,6 +108,9 @@ static int process_input(const char *input, char **out, unsigned int flags)
if (flags & FLAG_CZECH) if (flags & FLAG_CZECH)
return transcript_devanagari_to_czech(input, out); return transcript_devanagari_to_czech(input, out);
if (flags & FLAG_HINDI)
return transcript_devanagari_to_hindi(input, out);
if (flags & FLAG_ASCII) { if (flags & FLAG_ASCII) {
ret = transliterate_devanagari_to_latin(input, &tmp); ret = transliterate_devanagari_to_latin(input, &tmp);
if (ret != 0) if (ret != 0)
@ -128,9 +135,6 @@ static int process_string(const char *input, unsigned int flags)
case 0: case 0:
fprintf(stdout, "%s", output); fprintf(stdout, "%s", output);
break; break;
case EHINDI:
error("the input text is Hindi.");
break;
default: default:
error("unexpected error."); error("unexpected error.");
break; break;
@ -225,6 +229,9 @@ int main(int argc, const char **argv)
case 'c': case 'c':
flags |= FLAG_CZECH; flags |= FLAG_CZECH;
break; break;
case 'H':
flags |= FLAG_HINDI;
break;
case 'h': case 'h':
print_usage(); print_usage();
return 0; return 0;

View file

@ -1,7 +1,7 @@
#include "test.h" #include "test.h"
#include "transcript.h" #include "czech.h"
#include "../compat.h" #include "../compat.h"
#include "../transcription.h" #include "../czech.h"
static void test_transcript(const char *devanagari, const char *latin) static void test_transcript(const char *devanagari, const char *latin)
{ {
@ -34,18 +34,7 @@ START_TEST(test_transcript_devanagari_to_czech)
} }
END_TEST END_TEST
START_TEST(test_transcript_detect_hindi) void register_transcript_czech_tests(TCase *test_case)
{
char *czech;
int ret;
ret = transcript_devanagari_to_czech("लड़की", &czech);
ck_assert_int_eq(EHINDI, ret);
}
END_TEST
void register_transcript_tests(TCase *test_case)
{ {
tcase_add_test(test_case, test_transcript_devanagari_to_czech); tcase_add_test(test_case, test_transcript_devanagari_to_czech);
tcase_add_test(test_case, test_transcript_detect_hindi);
} }

8
tests/czech.h Normal file
View file

@ -0,0 +1,8 @@
#ifndef __TEST_CZECH_H
#define __TEST_CZECH_H
#include <check.h>
/* Add the Czech transcription unit tests to the given Check test case. */
void register_transcript_czech_tests(TCase *test_case);
#endif /* __TEST_CZECH_H */

30
tests/hindi.c Normal file
View file

@ -0,0 +1,30 @@
#include "test.h"
#include "hindi.h"
#include "../compat.h"
#include "../hindi.h"
/*
 * Transcribe @devanagari with transcript_devanagari_to_hindi() and assert
 * that the call returns 0 and that the result equals @latin.
 */
static void test_transcript(const char *devanagari, const char *latin)
{
char *hindi;
int ret;
ret = transcript_devanagari_to_hindi(devanagari, &hindi);
ck_assert_int_eq(0, ret);
ck_assert_str_eq(latin, hindi);
/* the transcription result is heap-allocated and owned by the caller */
free(hindi);
}
/*
 * Test vectors for the Hindi transcription: nukta consonants in both
 * encodings, whole phrases, and the candra o vowel.
 */
START_TEST(test_transcript_devanagari_to_hindi)
{
test_transcript("क़ ख़ ग़ ज़ झ़ ड़ ढ़ फ़", "k kh g z zh d dh f"); /* composite */
test_transcript("क़ ख़ ग़ ज़ ड़ ढ़ फ़", "k kh g z d dh f"); /* independent */
test_transcript("कभी ख़ुशी कभी ग़म", "kabhee khushee kabhee gam");
test_transcript("मैं एक लड़का हूँ और तुम एक लड़की हो", "main ek ladaka hoon aur tum ek ladakee ho");
test_transcript("स्कॉट्लैण्ड ऑरेंज", "skotlaind orenj"); /* o */
}
END_TEST
/* Add the Hindi transcription unit test to the given Check test case. */
void register_transcript_hindi_tests(TCase *test_case)
{
tcase_add_test(test_case, test_transcript_devanagari_to_hindi);
}

8
tests/hindi.h Normal file
View file

@ -0,0 +1,8 @@
#ifndef __TEST_HINDI_H
#define __TEST_HINDI_H
#include <check.h>
/* Add the Hindi transcription unit tests to the given Check test case. */
void register_transcript_hindi_tests(TCase *test_case);
#endif /* __TEST_HINDI_H */

View file

@ -88,13 +88,20 @@ START_TEST(test_transliterate_arguments)
} }
END_TEST END_TEST
START_TEST(test_transcript) START_TEST(test_transcript_czech)
{ {
test_output("./iast -c \"भगवद्गीता\"", "bhagavadgíta\n"); test_output("./iast -c \"भगवद्गीता\"", "bhagavadgíta\n");
test_output("./iast --czech \"तन्त्रशास्त्रम्\"", "tantrašástra\n"); test_output("./iast --czech \"तन्त्रशास्त्रम्\"", "tantrašástra\n");
} }
END_TEST END_TEST
START_TEST(test_transcript_hindi)
{
test_output("./iast -H \"हिन्दी\"", "hindee\n");
test_output("./iast --hindi \"लड़की\"", "ladakee\n");
}
END_TEST
START_TEST(test_velthuis) START_TEST(test_velthuis)
{ {
test_output("./iast \".rta.m ca satyam\" -e", "ṛtaṃ ca satyam\n"); test_output("./iast \".rta.m ca satyam\" -e", "ṛtaṃ ca satyam\n");
@ -141,7 +148,6 @@ START_TEST(test_errors)
{ {
test_output("./iast -x 2>&1", "[iast] error: unrecognised option '-x'.\n"); test_output("./iast -x 2>&1", "[iast] error: unrecognised option '-x'.\n");
test_output("./iast -f xxx 2>&1", "[iast] error: failed to read file 'xxx'.\n"); test_output("./iast -f xxx 2>&1", "[iast] error: failed to read file 'xxx'.\n");
test_output("./iast \u0921\u093c 2>&1", "[iast] error: the input text is Hindi.\n");
} }
END_TEST END_TEST
@ -149,7 +155,8 @@ void register_integration_tests(TCase *test_case)
{ {
tcase_add_test(test_case, test_transliterate_files); tcase_add_test(test_case, test_transliterate_files);
tcase_add_test(test_case, test_transliterate_arguments); tcase_add_test(test_case, test_transliterate_arguments);
tcase_add_test(test_case, test_transcript); tcase_add_test(test_case, test_transcript_czech);
tcase_add_test(test_case, test_transcript_hindi);
tcase_add_test(test_case, test_velthuis); tcase_add_test(test_case, test_velthuis);
tcase_add_test(test_case, test_ascii); tcase_add_test(test_case, test_ascii);
tcase_add_test(test_case, test_version); tcase_add_test(test_case, test_version);

View file

@ -1,6 +1,7 @@
#include "test.h" #include "test.h"
#include "translit.h" #include "translit.h"
#include "transcript.h" #include "czech.h"
#include "hindi.h"
#include "velthuis.h" #include "velthuis.h"
#include "utf8.h" #include "utf8.h"
#include "integration.h" #include "integration.h"
@ -14,7 +15,8 @@ static Suite *create_test_suite()
test_case = tcase_create(NULL); test_case = tcase_create(NULL);
register_translit_tests(test_case); register_translit_tests(test_case);
register_transcript_tests(test_case); register_transcript_czech_tests(test_case);
register_transcript_hindi_tests(test_case);
register_velthuis_encoder_tests(test_case); register_velthuis_encoder_tests(test_case);
register_utf8_tests(test_case); register_utf8_tests(test_case);

View file

@ -1,8 +0,0 @@
#ifndef __TEST_TRANSCRIPT_H
#define __TEST_TRANSCRIPT_H
#include <check.h>
void register_transcript_tests(TCase *test_case);
#endif /* __TEST_TRANSCRIPT_H */

View file

@ -84,14 +84,6 @@ START_TEST(test_translit_zero_width_joiner)
} }
END_TEST END_TEST
START_TEST(test_translit_detect_hindi)
{
char *hindi = NULL;
int ret = transliterate_devanagari_to_latin("लड़की", &hindi);
ck_assert_int_eq(EHINDI, ret);
}
END_TEST
void register_translit_tests(TCase *test_case) void register_translit_tests(TCase *test_case)
{ {
tcase_add_test(test_case, test_translit_devanagari_to_latin); tcase_add_test(test_case, test_translit_devanagari_to_latin);
@ -99,5 +91,4 @@ void register_translit_tests(TCase *test_case)
tcase_add_test(test_case, test_translit_lla_sylable); tcase_add_test(test_case, test_translit_lla_sylable);
tcase_add_test(test_case, test_translit_candrabindu); tcase_add_test(test_case, test_translit_candrabindu);
tcase_add_test(test_case, test_translit_zero_width_joiner); tcase_add_test(test_case, test_translit_zero_width_joiner);
tcase_add_test(test_case, test_translit_detect_hindi);
} }

View file

@ -8,7 +8,6 @@
#define SCHWA_CHARACTER 'a' #define SCHWA_CHARACTER 'a'
#define ZERO_WIDTH_JOINER 0x200d #define ZERO_WIDTH_JOINER 0x200d
#define VIRAMA 0x094d #define VIRAMA 0x094d
#define NUKTA 0x093c
#define CHUNKSIZE 1024 #define CHUNKSIZE 1024
static struct translit_letter table[] = { static struct translit_letter table[] = {
@ -136,11 +135,6 @@ int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
len = utf8_char_length(c); len = utf8_char_length(c);
src += len; src += len;
if (c == NUKTA) {
*ret = NULL;
return EHINDI;
}
letter = letter_by_code(c); letter = letter_by_code(c);
if (letter) { if (letter) {
switch (letter->type) { switch (letter->type) {