Implement transcription of Hindi
This commit is contained in:
parent
5685e83933
commit
b881497064
16 changed files with 308 additions and 65 deletions
|
@ -24,10 +24,10 @@ LFLAGS = @COVERAGE_LFLAGS@
|
||||||
TEST_CFLAGS = @CFLAGS@ @CHECK_CFLAGS@
|
TEST_CFLAGS = @CFLAGS@ @CHECK_CFLAGS@
|
||||||
TEST_LFLAGS = @CHECK_LIBS@ @COVERAGE_LFLAGS@
|
TEST_LFLAGS = @CHECK_LIBS@ @COVERAGE_LFLAGS@
|
||||||
|
|
||||||
OBJECTS = transliteration.o transcription.o utf8.o velthuis.o
|
OBJECTS = transliteration.o czech.o hindi.o utf8.o velthuis.o
|
||||||
|
|
||||||
TEST_OBJECTS = tests/main.o tests/translit.o tests/transcript.o \
|
TEST_OBJECTS = tests/main.o tests/translit.o tests/czech.o \
|
||||||
tests/velthuis.o tests/utf8.o tests/integration.o
|
tests/hindi.o tests/velthuis.o tests/utf8.o tests/integration.o
|
||||||
|
|
||||||
AUX_FILES = Makefile configure aclocal.m4 install-sh config.h* *.log \
|
AUX_FILES = Makefile configure aclocal.m4 install-sh config.h* *.log \
|
||||||
*.status *.cache
|
*.status *.cache
|
||||||
|
|
4
compat.h
4
compat.h
|
@ -12,8 +12,4 @@
|
||||||
|
|
||||||
#define ARRAY_SIZE(a) sizeof(a) / sizeof(*a)
|
#define ARRAY_SIZE(a) sizeof(a) / sizeof(*a)
|
||||||
|
|
||||||
enum err {
|
|
||||||
EHINDI = 1
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif /* __COMPAT_H */
|
#endif /* __COMPAT_H */
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
/* https://cs.wikipedia.org/wiki/Wikipedie:Transkripce_hind%C5%A1tiny */
|
/* https://cs.wikipedia.org/wiki/Wikipedie:Transkripce_hind%C5%A1tiny */
|
||||||
|
|
||||||
#include "compat.h"
|
#include "compat.h"
|
||||||
#include "transcription.h"
|
#include "czech.h"
|
||||||
#include "transliteration.h"
|
#include "transliteration.h"
|
||||||
#include "utf8.h"
|
#include "utf8.h"
|
||||||
|
|
||||||
|
@ -193,11 +193,6 @@ int transcript_devanagari_to_czech(const char *devanagari, char **ret)
|
||||||
if (c == ZERO_WIDTH_JOINER)
|
if (c == ZERO_WIDTH_JOINER)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (c == NUKTA) {
|
|
||||||
*ret = NULL;
|
|
||||||
return EHINDI;
|
|
||||||
}
|
|
||||||
|
|
||||||
nasal_consonants_filter(latin, &done, prev, c);
|
nasal_consonants_filter(latin, &done, prev, c);
|
||||||
|
|
||||||
letter = letter_by_code(c);
|
letter = letter_by_code(c);
|
|
@ -1,8 +1,8 @@
|
||||||
/* SPDX-License-Identifier: GPL-2.0 */
|
/* SPDX-License-Identifier: GPL-2.0 */
|
||||||
|
|
||||||
#ifndef __TRANSCRIPTION_H
|
#ifndef __CZECH_H
|
||||||
#define __TRANSCRIPTION_H
|
#define __CZECH_H
|
||||||
|
|
||||||
int transcript_devanagari_to_czech(const char *devanagari, char **ret);
|
int transcript_devanagari_to_czech(const char *devanagari, char **ret);
|
||||||
|
|
||||||
#endif /* __TRANSCRIPTION_H */
|
#endif /* __CZECH_H */
|
216
hindi.c
Normal file
216
hindi.c
Normal file
|
@ -0,0 +1,216 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0 */
|
||||||
|
/* https://cs.wikipedia.org/wiki/Wikipedie:Transkripce_hind%C5%A1tiny */
|
||||||
|
|
||||||
|
#include "compat.h"
|
||||||
|
#include "hindi.h"
|
||||||
|
#include "transliteration.h"
|
||||||
|
#include "utf8.h"
|
||||||
|
|
||||||
|
#define SCHWA_CHARACTER 'a'
|
||||||
|
#define ZERO_WIDTH_JOINER 0x200d
|
||||||
|
#define NUKTA 0x093c
|
||||||
|
#define CHUNKSIZE 1024
|
||||||
|
|
||||||
|
static struct translit_letter table[] = {
|
||||||
|
|
||||||
|
/* Special characters */
|
||||||
|
{0x0950, SPECIAL, "aum"}, /* aum */
|
||||||
|
|
||||||
|
/* Vowels */
|
||||||
|
{0x0910, VOWEL, "ai"}, /* 01 */
|
||||||
|
{0x0914, VOWEL, "au"}, /* 02 */
|
||||||
|
{0x0905, VOWEL, "a"}, /* 03 */
|
||||||
|
{0x0906, VOWEL, "aa"}, /* 04 */
|
||||||
|
{0x0907, VOWEL, "i"}, /* 05 */
|
||||||
|
{0x0908, VOWEL, "ee"}, /* 06 */
|
||||||
|
{0x0909, VOWEL, "u"}, /* 07 */
|
||||||
|
{0x090a, VOWEL, "oo"}, /* 08 */
|
||||||
|
{0x090b, VOWEL, "r"}, /* 09 */
|
||||||
|
{0x0960, VOWEL, "rr"}, /* 10 */
|
||||||
|
{0x090c, VOWEL, "l"}, /* 11 */
|
||||||
|
{0x0961, VOWEL, "ll"}, /* 12 */
|
||||||
|
{0x090f, VOWEL, "e"}, /* 13 */
|
||||||
|
{0x0913, VOWEL, "o"}, /* 14 */
|
||||||
|
{0x0911, VOWEL, "o"}, /* candra o */
|
||||||
|
{0x0912, VOWEL, "o"}, /* short o */
|
||||||
|
|
||||||
|
/* Consonants */
|
||||||
|
{0x0916, CONSONANT, "kh"}, /* 01 */
|
||||||
|
{0x0918, CONSONANT, "gh"}, /* 02 */
|
||||||
|
{0x091b, CONSONANT, "chh"}, /* 03 */
|
||||||
|
{0x091d, CONSONANT, "jh"}, /* 04 */
|
||||||
|
{0x0920, CONSONANT, "th"}, /* 05 */
|
||||||
|
{0x0922, CONSONANT, "dh"}, /* 06 */
|
||||||
|
{0x0925, CONSONANT, "th"}, /* 07 */
|
||||||
|
{0x0927, CONSONANT, "dh"}, /* 08 */
|
||||||
|
{0x092b, CONSONANT, "ph"}, /* 09 */
|
||||||
|
{0x092d, CONSONANT, "bh"}, /* 10 */
|
||||||
|
{0x0915, CONSONANT, "k"}, /* 11 */
|
||||||
|
{0x0917, CONSONANT, "g"}, /* 12 */
|
||||||
|
{0x0919, CONSONANT, "n"}, /* 13 */
|
||||||
|
{0x0939, CONSONANT, "h"}, /* 14 */
|
||||||
|
{0x091a, CONSONANT, "ch"}, /* 15 */
|
||||||
|
{0x091c, CONSONANT, "j"}, /* 16 */
|
||||||
|
{0x091e, CONSONANT, "n"}, /* 17 */
|
||||||
|
{0x092f, CONSONANT, "y"}, /* 18 */
|
||||||
|
{0x0936, CONSONANT, "sh"}, /* 19 */
|
||||||
|
{0x091f, CONSONANT, "t"}, /* 20 */
|
||||||
|
{0x0921, CONSONANT, "d"}, /* 21 */
|
||||||
|
{0x0923, CONSONANT, "n"}, /* 22 */
|
||||||
|
{0x0930, CONSONANT, "r"}, /* 23 */
|
||||||
|
{0x0937, CONSONANT, "sh"}, /* 24 */
|
||||||
|
{0x0924, CONSONANT, "t"}, /* 25 */
|
||||||
|
{0x0926, CONSONANT, "d"}, /* 26 */
|
||||||
|
{0x0928, CONSONANT, "n"}, /* 27 */
|
||||||
|
{0x0932, CONSONANT, "l"}, /* 28 */
|
||||||
|
{0x0938, CONSONANT, "s"}, /* 29 */
|
||||||
|
{0x092a, CONSONANT, "p"}, /* 30 */
|
||||||
|
{0x092c, CONSONANT, "b"}, /* 31 */
|
||||||
|
{0x092e, CONSONANT, "m"}, /* 32 */
|
||||||
|
{0x0935, CONSONANT, "v"}, /* 33 */
|
||||||
|
{0x0933, CONSONANT, "l"}, /* (.l) */
|
||||||
|
|
||||||
|
/* Additional consonants - idependent versions */
|
||||||
|
{0x0958, CONSONANT, "k"},
|
||||||
|
{0x0959, CONSONANT, "kh"},
|
||||||
|
{0x095a, CONSONANT, "g"},
|
||||||
|
{0x095b, CONSONANT, "z"},
|
||||||
|
{0x095c, CONSONANT, "d"},
|
||||||
|
{0x095d, CONSONANT, "dh"},
|
||||||
|
{0x095e, CONSONANT, "f"},
|
||||||
|
|
||||||
|
/* Codas */
|
||||||
|
{0x0902, CODA, "n"}, /* anusvara */
|
||||||
|
{0x0903, CODA, "h"}, /* visarga */
|
||||||
|
{0x093d, CODA, "'"}, /* avagrada (') */
|
||||||
|
{0x0901, CODA, "n"}, /* candrabindu */
|
||||||
|
|
||||||
|
/* Numbers */
|
||||||
|
{0x0966, NUMBER, "0"},
|
||||||
|
{0x0967, NUMBER, "1"},
|
||||||
|
{0x0968, NUMBER, "2"},
|
||||||
|
{0x0969, NUMBER, "3"},
|
||||||
|
{0x096a, NUMBER, "4"},
|
||||||
|
{0x096b, NUMBER, "5"},
|
||||||
|
{0x096c, NUMBER, "6"},
|
||||||
|
{0x096d, NUMBER, "7"},
|
||||||
|
{0x096e, NUMBER, "8"},
|
||||||
|
{0x096f, NUMBER, "9"},
|
||||||
|
|
||||||
|
/* Diacritic modifiers */
|
||||||
|
{0x0948, VOWEL_SIGN, "ai"},
|
||||||
|
{0x094c, VOWEL_SIGN, "au"},
|
||||||
|
{0x093e, VOWEL_SIGN, "aa"},
|
||||||
|
{0x093f, VOWEL_SIGN, "i"},
|
||||||
|
{0x0940, VOWEL_SIGN, "ee"},
|
||||||
|
{0x0941, VOWEL_SIGN, "u"},
|
||||||
|
{0x0942, VOWEL_SIGN, "oo"},
|
||||||
|
{0x0943, VOWEL_SIGN, "r"},
|
||||||
|
{0x0944, VOWEL_SIGN, "rr"},
|
||||||
|
{0x0962, VOWEL_SIGN, "l"},
|
||||||
|
{0x0963, VOWEL_SIGN, "ll"},
|
||||||
|
{0x0947, VOWEL_SIGN, "e"},
|
||||||
|
{0x094b, VOWEL_SIGN, "o"},
|
||||||
|
{0x0949, VOWEL_SIGN, "o"}, /* candra o */
|
||||||
|
{0x094a, VOWEL_SIGN, "o"}, /* short o */
|
||||||
|
{0x094d, VOWEL_SIGN, ""}, /* virama */
|
||||||
|
|
||||||
|
{0x0965, CODA, "||"}, /* double danda */
|
||||||
|
{0x0964, CODA, "|"}, /* danda */
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct translit_letter *letter_by_code(unsigned int c)
|
||||||
|
{
|
||||||
|
unsigned int i;
|
||||||
|
|
||||||
|
for (i = 0; i < ARRAY_SIZE(table); i++) {
|
||||||
|
if (table[i].code == c) {
|
||||||
|
return table + i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Rewrite the most recently emitted consonant after a nukta sign was read.
 *
 * latin: output buffer being built
 * pos:   number of bytes written so far; decremented by one when the
 *        replacement text is shorter than what it replaces
 * prev:  code point that preceded the nukta
 *
 * The bytes just before *pos are expected to hold the transcription of prev
 * followed by the inherent schwa ("ja", "jha" or "pha"). Any other prev
 * leaves the buffer and *pos untouched.
 */
static void nukta_filter(char *latin, unsigned int *pos, unsigned int prev)
{
	char *tail = latin + *pos;

	if (prev == 0x091c) {
		/* "ja" -> "za" */
		tail[-2] = 'z';
	} else if (prev == 0x091d) {
		/* "jha" -> "zha" */
		tail[-3] = 'z';
	} else if (prev == 0x092b) {
		/* "pha" -> "fa": the result is one byte shorter */
		strcpy(tail - 3, "fa");
		*pos -= 1;
	}
}
|
||||||
|
|
||||||
|
int transcript_devanagari_to_hindi(const char *devanagari, char **ret)
|
||||||
|
{
|
||||||
|
struct translit_letter *letter;
|
||||||
|
unsigned int c, prev = 0, alloc = 0, done = 0, len;
|
||||||
|
const char *src = devanagari;
|
||||||
|
char *latin = NULL;
|
||||||
|
|
||||||
|
while (1) {
|
||||||
|
if (alloc < done + UNICODE_MAX_LENGTH) {
|
||||||
|
latin = realloc(latin, alloc + CHUNKSIZE);
|
||||||
|
alloc += CHUNKSIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
c = utf8_unpack_char(src);
|
||||||
|
len = utf8_char_length(c);
|
||||||
|
src += len;
|
||||||
|
|
||||||
|
if (c == ZERO_WIDTH_JOINER)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
letter = letter_by_code(c);
|
||||||
|
if (letter) {
|
||||||
|
switch (letter->type) {
|
||||||
|
case CONSONANT:
|
||||||
|
strcpy(latin + done, letter->data);
|
||||||
|
done += strlen(letter->data);
|
||||||
|
*(latin + done++) = SCHWA_CHARACTER;
|
||||||
|
break;
|
||||||
|
case VOWEL_SIGN:
|
||||||
|
if (done) {
|
||||||
|
/* delete the inherent schwa */
|
||||||
|
done--;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
strcpy(latin + done, letter->data);
|
||||||
|
done += strlen(letter->data);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (done && c == NUKTA) {
|
||||||
|
nukta_filter(latin, &done, prev);
|
||||||
|
goto next;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* remove the final schwa */
|
||||||
|
if (is_devanagari(prev) && !is_devanagari(c)) {
|
||||||
|
if (latin[done - 1] == SCHWA_CHARACTER) {
|
||||||
|
latin[--done] = '\0';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
utf8_pack_char(latin + done, c);
|
||||||
|
done += len;
|
||||||
|
}
|
||||||
|
next:
|
||||||
|
if (c == 0)
|
||||||
|
break;
|
||||||
|
prev = c;
|
||||||
|
}
|
||||||
|
|
||||||
|
*(latin + done - 1) = '\0';
|
||||||
|
|
||||||
|
*ret = latin;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
8
hindi.h
Normal file
8
hindi.h
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0 */
|
||||||
|
|
||||||
|
#ifndef __HINDI_H
|
||||||
|
#define __HINDI_H
|
||||||
|
|
||||||
|
int transcript_devanagari_to_hindi(const char *devanagari, char **ret);
|
||||||
|
|
||||||
|
#endif /* __HINDI_H */
|
21
main.c
21
main.c
|
@ -1,14 +1,16 @@
|
||||||
#include "compat.h"
|
#include "compat.h"
|
||||||
#include "transliteration.h"
|
#include "transliteration.h"
|
||||||
#include "transcription.h"
|
#include "czech.h"
|
||||||
|
#include "hindi.h"
|
||||||
#include "velthuis.h"
|
#include "velthuis.h"
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
|
|
||||||
#define FLAG_REVERSE 1 << 0
|
#define FLAG_REVERSE 1 << 0
|
||||||
#define FLAG_VELTHUIS 1 << 1
|
#define FLAG_VELTHUIS 1 << 1
|
||||||
#define FLAG_CZECH 1 << 2
|
#define FLAG_CZECH 1 << 2
|
||||||
#define FLAG_ASCII 1 << 3
|
#define FLAG_HINDI 1 << 3
|
||||||
#define FLAG_DEVANAGARI 1 << 4
|
#define FLAG_ASCII 1 << 4
|
||||||
|
#define FLAG_DEVANAGARI 1 << 5
|
||||||
|
|
||||||
static const char *usage_str =
|
static const char *usage_str =
|
||||||
PROGNAME ", a helper for Sanskrit transliteration.\n"
|
PROGNAME ", a helper for Sanskrit transliteration.\n"
|
||||||
|
@ -23,6 +25,7 @@ static const char *usage_str =
|
||||||
" -a, --ascii convert a Devanagari text to Velthuis text rather than to IAST\n"
|
" -a, --ascii convert a Devanagari text to Velthuis text rather than to IAST\n"
|
||||||
" -d, --devanagari when encoding, output a Devanagari text rather than IAST\n"
|
" -d, --devanagari when encoding, output a Devanagari text rather than IAST\n"
|
||||||
" -c, --czech transcript Devanagari to Czech language (experimental)\n"
|
" -c, --czech transcript Devanagari to Czech language (experimental)\n"
|
||||||
|
" -H, --hindi transcript Hindi from Devanagari to Latin (experimental)\n"
|
||||||
" -h, --help show this help and exit\n"
|
" -h, --help show this help and exit\n"
|
||||||
" -v, --version show version number and exit\n"
|
" -v, --version show version number and exit\n"
|
||||||
"\n"
|
"\n"
|
||||||
|
@ -38,7 +41,7 @@ static const char *usage_str =
|
||||||
"\n"
|
"\n"
|
||||||
" For more information see the iast(1) manual page.\n";
|
" For more information see the iast(1) manual page.\n";
|
||||||
|
|
||||||
static const char *short_opts = "f:readchv";
|
static const char *short_opts = "f:readcHhv";
|
||||||
|
|
||||||
static const struct option long_opts[] = {
|
static const struct option long_opts[] = {
|
||||||
{"file", required_argument, 0, 'f'},
|
{"file", required_argument, 0, 'f'},
|
||||||
|
@ -48,6 +51,7 @@ static const struct option long_opts[] = {
|
||||||
{"ascii", no_argument, 0, 'a'},
|
{"ascii", no_argument, 0, 'a'},
|
||||||
{"devanagari", no_argument, 0, 'd'},
|
{"devanagari", no_argument, 0, 'd'},
|
||||||
{"czech", no_argument, 0, 'c'},
|
{"czech", no_argument, 0, 'c'},
|
||||||
|
{"hindi", no_argument, 0, 'H'},
|
||||||
{"help", no_argument, 0, 'h'},
|
{"help", no_argument, 0, 'h'},
|
||||||
{"version", no_argument, 0, 'v'},
|
{"version", no_argument, 0, 'v'},
|
||||||
{0, 0, 0, 0}
|
{0, 0, 0, 0}
|
||||||
|
@ -104,6 +108,9 @@ static int process_input(const char *input, char **out, unsigned int flags)
|
||||||
if (flags & FLAG_CZECH)
|
if (flags & FLAG_CZECH)
|
||||||
return transcript_devanagari_to_czech(input, out);
|
return transcript_devanagari_to_czech(input, out);
|
||||||
|
|
||||||
|
if (flags & FLAG_HINDI)
|
||||||
|
return transcript_devanagari_to_hindi(input, out);
|
||||||
|
|
||||||
if (flags & FLAG_ASCII) {
|
if (flags & FLAG_ASCII) {
|
||||||
ret = transliterate_devanagari_to_latin(input, &tmp);
|
ret = transliterate_devanagari_to_latin(input, &tmp);
|
||||||
if (ret != 0)
|
if (ret != 0)
|
||||||
|
@ -128,9 +135,6 @@ static int process_string(const char *input, unsigned int flags)
|
||||||
case 0:
|
case 0:
|
||||||
fprintf(stdout, "%s", output);
|
fprintf(stdout, "%s", output);
|
||||||
break;
|
break;
|
||||||
case EHINDI:
|
|
||||||
error("the input text is Hindi.");
|
|
||||||
break;
|
|
||||||
default:
|
default:
|
||||||
error("unexpected error.");
|
error("unexpected error.");
|
||||||
break;
|
break;
|
||||||
|
@ -225,6 +229,9 @@ int main(int argc, const char **argv)
|
||||||
case 'c':
|
case 'c':
|
||||||
flags |= FLAG_CZECH;
|
flags |= FLAG_CZECH;
|
||||||
break;
|
break;
|
||||||
|
case 'H':
|
||||||
|
flags |= FLAG_HINDI;
|
||||||
|
break;
|
||||||
case 'h':
|
case 'h':
|
||||||
print_usage();
|
print_usage();
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
#include "test.h"
|
#include "test.h"
|
||||||
#include "transcript.h"
|
#include "czech.h"
|
||||||
#include "../compat.h"
|
#include "../compat.h"
|
||||||
#include "../transcription.h"
|
#include "../czech.h"
|
||||||
|
|
||||||
static void test_transcript(const char *devanagari, const char *latin)
|
static void test_transcript(const char *devanagari, const char *latin)
|
||||||
{
|
{
|
||||||
|
@ -34,18 +34,7 @@ START_TEST(test_transcript_devanagari_to_czech)
|
||||||
}
|
}
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
START_TEST(test_transcript_detect_hindi)
|
void register_transcript_czech_tests(TCase *test_case)
|
||||||
{
|
|
||||||
char *czech;
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
ret = transcript_devanagari_to_czech("लड़की", &czech);
|
|
||||||
ck_assert_int_eq(EHINDI, ret);
|
|
||||||
}
|
|
||||||
END_TEST
|
|
||||||
|
|
||||||
void register_transcript_tests(TCase *test_case)
|
|
||||||
{
|
{
|
||||||
tcase_add_test(test_case, test_transcript_devanagari_to_czech);
|
tcase_add_test(test_case, test_transcript_devanagari_to_czech);
|
||||||
tcase_add_test(test_case, test_transcript_detect_hindi);
|
|
||||||
}
|
}
|
8
tests/czech.h
Normal file
8
tests/czech.h
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
#ifndef __TEST_CZECH_H
|
||||||
|
#define __TEST_CZECH_H
|
||||||
|
|
||||||
|
#include <check.h>
|
||||||
|
|
||||||
|
void register_transcript_czech_tests(TCase *test_case);
|
||||||
|
|
||||||
|
#endif /* __TEST_CZECH_H */
|
30
tests/hindi.c
Normal file
30
tests/hindi.c
Normal file
|
@ -0,0 +1,30 @@
|
||||||
|
#include "test.h"
|
||||||
|
#include "hindi.h"
|
||||||
|
#include "../compat.h"
|
||||||
|
#include "../hindi.h"
|
||||||
|
|
||||||
|
/*
 * Transcribe one Devanagari sample and check both the return code and the
 * produced Latin text; the result buffer is released afterwards.
 */
static void test_transcript(const char *devanagari, const char *latin)
{
	char *hindi;
	int ret = transcript_devanagari_to_hindi(devanagari, &hindi);

	ck_assert_int_eq(0, ret);
	ck_assert_str_eq(latin, hindi);

	free(hindi);
}
|
||||||
|
|
||||||
|
START_TEST(test_transcript_devanagari_to_hindi)
|
||||||
|
{
|
||||||
|
test_transcript("क़ ख़ ग़ ज़ झ़ ड़ ढ़ फ़", "k kh g z zh d dh f"); /* composite */
|
||||||
|
test_transcript("क़ ख़ ग़ ज़ ड़ ढ़ फ़", "k kh g z d dh f"); /* independent */
|
||||||
|
test_transcript("कभी ख़ुशी कभी ग़म", "kabhee khushee kabhee gam");
|
||||||
|
test_transcript("मैं एक लड़का हूँ और तुम एक लड़की हो", "main ek ladaka hoon aur tum ek ladakee ho");
|
||||||
|
test_transcript("स्कॉट्लैण्ड ऑरेंज", "skotlaind orenj"); /* o */
|
||||||
|
}
|
||||||
|
END_TEST
|
||||||
|
|
||||||
|
/* Add the Hindi transcription unit test to the given test case. */
void register_transcript_hindi_tests(TCase *test_case)
{
	tcase_add_test(test_case, test_transcript_devanagari_to_hindi);
}
|
8
tests/hindi.h
Normal file
8
tests/hindi.h
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
#ifndef __TEST_HINDI_H
|
||||||
|
#define __TEST_HINDI_H
|
||||||
|
|
||||||
|
#include <check.h>
|
||||||
|
|
||||||
|
void register_transcript_hindi_tests(TCase *test_case);
|
||||||
|
|
||||||
|
#endif /* __TEST_HINDI_H */
|
|
@ -88,13 +88,20 @@ START_TEST(test_transliterate_arguments)
|
||||||
}
|
}
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
START_TEST(test_transcript)
|
START_TEST(test_transcript_czech)
|
||||||
{
|
{
|
||||||
test_output("./iast -c \"भगवद्गीता\"", "bhagavadgíta\n");
|
test_output("./iast -c \"भगवद्गीता\"", "bhagavadgíta\n");
|
||||||
test_output("./iast --czech \"तन्त्रशास्त्रम्\"", "tantrašástra\n");
|
test_output("./iast --czech \"तन्त्रशास्त्रम्\"", "tantrašástra\n");
|
||||||
}
|
}
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
|
START_TEST(test_transcript_hindi)
|
||||||
|
{
|
||||||
|
test_output("./iast -H \"हिन्दी\"", "hindee\n");
|
||||||
|
test_output("./iast --hindi \"लड़की\"", "ladakee\n");
|
||||||
|
}
|
||||||
|
END_TEST
|
||||||
|
|
||||||
START_TEST(test_velthuis)
|
START_TEST(test_velthuis)
|
||||||
{
|
{
|
||||||
test_output("./iast \".rta.m ca satyam\" -e", "ṛtaṃ ca satyam\n");
|
test_output("./iast \".rta.m ca satyam\" -e", "ṛtaṃ ca satyam\n");
|
||||||
|
@ -141,7 +148,6 @@ START_TEST(test_errors)
|
||||||
{
|
{
|
||||||
test_output("./iast -x 2>&1", "[iast] error: unrecognised option '-x'.\n");
|
test_output("./iast -x 2>&1", "[iast] error: unrecognised option '-x'.\n");
|
||||||
test_output("./iast -f xxx 2>&1", "[iast] error: failed to read file 'xxx'.\n");
|
test_output("./iast -f xxx 2>&1", "[iast] error: failed to read file 'xxx'.\n");
|
||||||
test_output("./iast \u0921\u093c 2>&1", "[iast] error: the input text is Hindi.\n");
|
|
||||||
}
|
}
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
|
@ -149,7 +155,8 @@ void register_integration_tests(TCase *test_case)
|
||||||
{
|
{
|
||||||
tcase_add_test(test_case, test_transliterate_files);
|
tcase_add_test(test_case, test_transliterate_files);
|
||||||
tcase_add_test(test_case, test_transliterate_arguments);
|
tcase_add_test(test_case, test_transliterate_arguments);
|
||||||
tcase_add_test(test_case, test_transcript);
|
tcase_add_test(test_case, test_transcript_czech);
|
||||||
|
tcase_add_test(test_case, test_transcript_hindi);
|
||||||
tcase_add_test(test_case, test_velthuis);
|
tcase_add_test(test_case, test_velthuis);
|
||||||
tcase_add_test(test_case, test_ascii);
|
tcase_add_test(test_case, test_ascii);
|
||||||
tcase_add_test(test_case, test_version);
|
tcase_add_test(test_case, test_version);
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
#include "test.h"
|
#include "test.h"
|
||||||
#include "translit.h"
|
#include "translit.h"
|
||||||
#include "transcript.h"
|
#include "czech.h"
|
||||||
|
#include "hindi.h"
|
||||||
#include "velthuis.h"
|
#include "velthuis.h"
|
||||||
#include "utf8.h"
|
#include "utf8.h"
|
||||||
#include "integration.h"
|
#include "integration.h"
|
||||||
|
@ -14,7 +15,8 @@ static Suite *create_test_suite()
|
||||||
test_case = tcase_create(NULL);
|
test_case = tcase_create(NULL);
|
||||||
|
|
||||||
register_translit_tests(test_case);
|
register_translit_tests(test_case);
|
||||||
register_transcript_tests(test_case);
|
register_transcript_czech_tests(test_case);
|
||||||
|
register_transcript_hindi_tests(test_case);
|
||||||
register_velthuis_encoder_tests(test_case);
|
register_velthuis_encoder_tests(test_case);
|
||||||
register_utf8_tests(test_case);
|
register_utf8_tests(test_case);
|
||||||
|
|
||||||
|
|
|
@ -1,8 +0,0 @@
|
||||||
#ifndef __TEST_TRANSCRIPT_H
|
|
||||||
#define __TEST_TRANSCRIPT_H
|
|
||||||
|
|
||||||
#include <check.h>
|
|
||||||
|
|
||||||
void register_transcript_tests(TCase *test_case);
|
|
||||||
|
|
||||||
#endif /* __TEST_TRANSCRIPT_H */
|
|
|
@ -84,14 +84,6 @@ START_TEST(test_translit_zero_width_joiner)
|
||||||
}
|
}
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
START_TEST(test_translit_detect_hindi)
|
|
||||||
{
|
|
||||||
char *hindi = NULL;
|
|
||||||
int ret = transliterate_devanagari_to_latin("लड़की", &hindi);
|
|
||||||
ck_assert_int_eq(EHINDI, ret);
|
|
||||||
}
|
|
||||||
END_TEST
|
|
||||||
|
|
||||||
void register_translit_tests(TCase *test_case)
|
void register_translit_tests(TCase *test_case)
|
||||||
{
|
{
|
||||||
tcase_add_test(test_case, test_translit_devanagari_to_latin);
|
tcase_add_test(test_case, test_translit_devanagari_to_latin);
|
||||||
|
@ -99,5 +91,4 @@ void register_translit_tests(TCase *test_case)
|
||||||
tcase_add_test(test_case, test_translit_lla_sylable);
|
tcase_add_test(test_case, test_translit_lla_sylable);
|
||||||
tcase_add_test(test_case, test_translit_candrabindu);
|
tcase_add_test(test_case, test_translit_candrabindu);
|
||||||
tcase_add_test(test_case, test_translit_zero_width_joiner);
|
tcase_add_test(test_case, test_translit_zero_width_joiner);
|
||||||
tcase_add_test(test_case, test_translit_detect_hindi);
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,7 +8,6 @@
|
||||||
#define SCHWA_CHARACTER 'a'
|
#define SCHWA_CHARACTER 'a'
|
||||||
#define ZERO_WIDTH_JOINER 0x200d
|
#define ZERO_WIDTH_JOINER 0x200d
|
||||||
#define VIRAMA 0x094d
|
#define VIRAMA 0x094d
|
||||||
#define NUKTA 0x093c
|
|
||||||
#define CHUNKSIZE 1024
|
#define CHUNKSIZE 1024
|
||||||
|
|
||||||
static struct translit_letter table[] = {
|
static struct translit_letter table[] = {
|
||||||
|
@ -136,11 +135,6 @@ int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
|
||||||
len = utf8_char_length(c);
|
len = utf8_char_length(c);
|
||||||
src += len;
|
src += len;
|
||||||
|
|
||||||
if (c == NUKTA) {
|
|
||||||
*ret = NULL;
|
|
||||||
return EHINDI;
|
|
||||||
}
|
|
||||||
|
|
||||||
letter = letter_by_code(c);
|
letter = letter_by_code(c);
|
||||||
if (letter) {
|
if (letter) {
|
||||||
switch (letter->type) {
|
switch (letter->type) {
|
||||||
|
|
Loading…
Reference in a new issue