Report conversion errors through return codes; reject Hindi input

The conversion routines now return an int status (0 on success) and hand
the converted string back through a char ** output parameter.  A nukta
(U+093C) in Devanagari input makes transcript_devanagari_to_czech() and
transliterate_devanagari_to_latin() store NULL in the output parameter
and return EHINDI.

Review amendments folded into this revision:
  * main.c: process_string() initialises 'output' to NULL so the final
    free() is safe when a converter fails without storing a result.
  * transcription.c, transliteration.c: free the partially built buffer
    before returning EHINDI (assumes 'latin' is NULL or heap-allocated
    at that point; its allocation lies outside the hunks, confirm).
  * transcription.c, transliteration.c: mark the intentional
    VOWEL_SIGN to default fall-through.
  * tests: assert the status returned by the latin-to-devanagari and
    velthuis helpers; initialise 'czech' in the Hindi detection test.

Notes:
  * Hunk headers were recounted for the added lines; the post-image
    hashes on the "index" lines of the amended files are stale.
  * The header names on the two "#include" context lines of compat.h are
    missing from this copy and must be restored before applying.
  * NOTE(review): the memmove() context line in end_of_word_filter()
    passes the code point 'c' as the length; a byte count looks
    intended.  Not touched here, confirm separately.

diff --git a/compat.h b/compat.h
index 4ca046c..78cf55e 100644
--- a/compat.h
+++ b/compat.h
@@ -10,4 +10,8 @@
 #include
 #include
 
+enum err {
+	EHINDI = 1
+};
+
 #endif /* __COMPAT_H */
diff --git a/main.c b/main.c
index 0d5414b..f94c64f 100644
--- a/main.c
+++ b/main.c
@@ -68,18 +68,47 @@ static void error(const char *msg, ...)
 	va_end(params);
 }
 
-static char *process_input(const char *input, unsigned int flags)
+static int process_input(const char *input, char **out, unsigned int flags)
 {
 	if (flags & FLAG_REVERSE)
-		return transliterate_latin_to_devanagari(input);
+		return transliterate_latin_to_devanagari(input, out);
 
 	if (flags & FLAG_VELTHUIS)
-		return encode_velthuis_to_iast_punctation(input);
+		return encode_velthuis_to_iast_punctation(input, out);
 
 	if (flags & FLAG_CZECH)
-		return transcript_devanagari_to_czech(input);
+		return transcript_devanagari_to_czech(input, out);
 
-	return transliterate_devanagari_to_latin(input);
+	return transliterate_devanagari_to_latin(input, out);
+}
+
+/*
+ * Convert input as selected by flags and print the result to stdout.
+ * Failures are reported through error().  Returns 0 on success,
+ * otherwise the code returned by process_input().
+ */
+static int process_string(const char *input, unsigned int flags)
+{
+	char *output = NULL;	/* free(NULL) is a no-op on error paths */
+	int ret;
+
+	ret = process_input(input, &output, flags);
+
+	switch (ret) {
+	case 0:
+		fprintf(stdout, "%s", output);
+		break;
+	case EHINDI:
+		error("the input text is Hindi");
+		break;
+	default:
+		error("unexpected error");
+		break;
+	}
+
+	free(output);
+
+	return ret;
 }
 
 #define CHUNKSIZE 1024
@@ -135,7 +164,7 @@ int main(int argc, const char **argv)
 	const char *files[argc];
 	unsigned int nfiles = 0;
 	unsigned int flags = 0;
-	char *input, *output;
+	char *input;
 
 	if (argc == 1) {
 		print_usage();
@@ -178,10 +207,11 @@ int main(int argc, const char **argv)
 
 	while (optind < argc) {
 		const char *arg = argv[optind++];
-		output = process_input(arg, flags);
-		fprintf(stdout, "%s\n", output);
-		free(output);
-	}
+		retval = process_string(arg, flags);
+		if (retval != 0)
+			return retval;
+		putchar('\n');
+	}
 
 	for (i = 0; i < nfiles; i++) {
 		if (strcmp(files[i], "-") == 0) {
@@ -194,11 +224,10 @@ int main(int argc, const char **argv)
 			error("failed to read file '%s'.", files[i]);
 			return retval;
 		}
-
-		output = process_input(input, flags);
-		fprintf(stdout, "%s", output);
-		free(output);
+		retval = process_string(input, flags);
 		free(input);
+		if (retval != 0)
+			return retval;
 	}
 
 	return 0;
diff --git a/tests/transcript.c b/tests/transcript.c
index b8a1e3f..c62370d 100644
--- a/tests/transcript.c
+++ b/tests/transcript.c
@@ -1,10 +1,15 @@
 #include "test.h"
 #include "transcript.h"
+#include "../compat.h"
 #include "../transcription.h"
 
 static void test_transcript(const char *devanagari, const char *latin)
 {
-	char *czech = transcript_devanagari_to_czech(devanagari);
+	char *czech;
+	int ret;
+
+	ret = transcript_devanagari_to_czech(devanagari, &czech);
+	ck_assert_int_eq(0, ret);
 	ck_assert_str_eq(latin, czech);
 	free(czech);
 }
@@ -27,7 +32,18 @@ START_TEST(test_transcript_devanagari_to_czech)
 }
 END_TEST
 
+START_TEST(test_transcript_detect_hindi)
+{
+	char *czech = NULL;
+	int ret;
+
+	ret = transcript_devanagari_to_czech("लड़की", &czech);
+	ck_assert_int_eq(EHINDI, ret);
+}
+END_TEST
+
 void register_transcript_tests(TCase *test_case)
 {
 	tcase_add_test(test_case, test_transcript_devanagari_to_czech);
+	tcase_add_test(test_case, test_transcript_detect_hindi);
 }
diff --git a/tests/translit.c b/tests/translit.c
index 3f734ea..35c9720 100644
--- a/tests/translit.c
+++ b/tests/translit.c
@@ -1,11 +1,16 @@
 #include "test.h"
 #include "translit.h"
+#include "../compat.h"
 #include "../transliteration.h"
 
 static void test_transliterate_devanagari_to_latin(const char *devanagari,
 		const char *latin)
 {
-	char *str = transliterate_devanagari_to_latin(devanagari);
+	char *str;
+	int ret;
+
+	ret = transliterate_devanagari_to_latin(devanagari, &str);
+	ck_assert_int_eq(0, ret);
 	ck_assert_str_eq(latin, str);
 	free(str);
 }
@@ -13,7 +18,11 @@ static void test_transliterate_devanagari_to_latin(const char *devanagari,
 static void test_transliterate_latin_to_devanagari(const char *latin,
 		const char *devanagari)
 {
-	char *str = transliterate_latin_to_devanagari(latin);
+	char *str;
+	int ret;
+
+	ret = transliterate_latin_to_devanagari(latin, &str);
+	ck_assert_int_eq(0, ret);
 	ck_assert_str_eq(devanagari, str);
 	free(str);
 }
@@ -65,10 +74,19 @@ START_TEST(test_translit_candrabindu)
 }
 END_TEST
 
+START_TEST(test_translit_detect_hindi)
+{
+	char *hindi = NULL;
+	int ret = transliterate_devanagari_to_latin("लड़की", &hindi);
+	ck_assert_int_eq(EHINDI, ret);
+}
+END_TEST
+
 void register_translit_tests(TCase *test_case)
 {
 	tcase_add_test(test_case, test_translit_devanagari_to_latin);
 	tcase_add_test(test_case, test_translit_latin_to_devanagari);
 	tcase_add_test(test_case, test_translit_lla_sylable);
 	tcase_add_test(test_case, test_translit_candrabindu);
+	tcase_add_test(test_case, test_translit_detect_hindi);
 }
diff --git a/tests/velthuis.c b/tests/velthuis.c
index 6c0c037..138afcf 100644
--- a/tests/velthuis.c
+++ b/tests/velthuis.c
@@ -4,7 +4,11 @@
 
 static void test_encoding(const char *in, const char *expected)
 {
-	char *iast = encode_velthuis_to_iast_punctation(in);
+	char *iast;
+	int ret;
+
+	ret = encode_velthuis_to_iast_punctation(in, &iast);
+	ck_assert_int_eq(0, ret);
 	ck_assert_str_eq(expected, iast);
 	free(iast);
 }
diff --git a/transcription.c b/transcription.c
index 4a22e2d..8af00d5 100644
--- a/transcription.c
+++ b/transcription.c
@@ -6,8 +6,10 @@
 #include "iast-czech.h"
 #include "utf8.h"
 
-#define CHUNKSIZE 1024
 #define SCHWA_CHARACTER 'a'
+#define VIRAMA 0x094d
+#define NUKTA 0x093c
+#define CHUNKSIZE 1024
 
 static inline int is_consonant(unsigned int c)
 {
@@ -58,7 +60,7 @@ static void end_of_word_filter(char *latin, unsigned int *pos,
 
 	/* remove singular nominative suffix */
 	len = utf8_char_length(c);
-	if (prev == 0x094d && *(latin + *pos - 1 - len) == 'm') {
+	if (prev == VIRAMA && *(latin + *pos - 1 - len) == 'm') {
 		memmove(latin + *pos - 1 - len, latin + *pos - len, c);
 		*pos = *pos - 1;
 	}
@@ -77,7 +79,7 @@ static struct translit_letter *letter_by_code(struct translit_letter *table,
 	return NULL;
 }
 
-char *transcript_devanagari_to_czech(const char *devanagari)
+int transcript_devanagari_to_czech(const char *devanagari, char **ret)
 {
 	struct translit_letter *table, *letter;
 	unsigned int c, prev = 0, alloc = 0, done = 0, len;
@@ -98,6 +100,13 @@ char *transcript_devanagari_to_czech(const char *devanagari)
 
 		nasal_consonants_filter(latin, &done, prev, c);
 
+		if (c == NUKTA) {
+			/* partial result is not returned; free it */
+			free(latin);
+			*ret = NULL;
+			return EHINDI;
+		}
+
 		letter = letter_by_code(table, c);
 		if (letter) {
 			switch (letter->type) {
@@ -107,11 +116,11 @@ char *transcript_devanagari_to_czech(const char *devanagari)
 				*(latin + done++) = SCHWA_CHARACTER;
 				break;
 			case VOWEL_SIGN:
-				if (done)
+				if (done) {
+					/* delete the inherent schwa */
 					done--;
-				strcpy(latin + done, letter->data);
-				done += strlen(letter->data);
-				break;
+				}
+				/* fallthrough */
 			default:
 				strcpy(latin + done, letter->data);
 				done += strlen(letter->data);
@@ -132,5 +141,7 @@ char *transcript_devanagari_to_czech(const char *devanagari)
 
 	*(latin + done - 1) = '\0';
 
-	return latin;
+	*ret = latin;
+
+	return 0;
 }
diff --git a/transcription.h b/transcription.h
index 7b40dd8..11ccbdb 100644
--- a/transcription.h
+++ b/transcription.h
@@ -3,6 +3,6 @@
 #ifndef __TRANSCRIPTION_H
 #define __TRANSCRIPTION_H
 
-char *transcript_devanagari_to_czech(const char *devanagari);
+int transcript_devanagari_to_czech(const char *devanagari, char **ret);
 
 #endif /* __TRANSCRIPTION_H */
diff --git a/transliteration.c b/transliteration.c
index d5a0e35..7dda25e 100644
--- a/transliteration.c
+++ b/transliteration.c
@@ -6,7 +6,8 @@
 #include "utf8.h"
 
 #define SCHWA_CHARACTER 'a'
-#define VIRAMA 0x094d
+#define VIRAMA 0x094d
+#define NUKTA 0x093c
 #define CHUNKSIZE 1024
 
 static struct translit_letter *letter_by_code(struct translit_letter *table,
@@ -21,7 +22,7 @@ static struct translit_letter *letter_by_code(struct translit_letter *table,
 	return NULL;
 }
 
-char *transliterate_devanagari_to_latin(const char *devanagari)
+int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
 {
 	struct translit_letter *table, *letter;
 	unsigned int c, alloc = 0, done = 0, len;
@@ -40,6 +41,13 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
 		len = utf8_char_length(c);
 		src += len;
 
+		if (c == NUKTA) {
+			/* partial result is not returned; free it */
+			free(latin);
+			*ret = NULL;
+			return EHINDI;
+		}
+
 		letter = letter_by_code(table, c);
 		if (letter) {
 			switch (letter->type) {
@@ -49,11 +57,11 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
 				*(latin + done++) = SCHWA_CHARACTER;
 				break;
 			case VOWEL_SIGN:
-				if (done)
+				if (done) {
+					/* delete the inherent schwa */
 					done--;
-				strcpy(latin + done, letter->data);
-				done += strlen(letter->data);
-				break;
+				}
+				/* fallthrough */
 			default:
 				strcpy(latin + done, letter->data);
 				done += strlen(letter->data);
@@ -68,7 +76,9 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
 			break;
 	}
 
-	return latin;
+	*ret = latin;
+
+	return 0;
 }
 
 static struct translit_letter *letter_by_data(struct translit_letter *table,
@@ -98,7 +108,7 @@ static struct translit_letter *vowel_sign_by_data(struct translit_letter *table,
 	return NULL;
 }
 
-char *transliterate_latin_to_devanagari(const char *latin)
+int transliterate_latin_to_devanagari(const char *latin, char **ret)
 {
 	struct translit_letter *table, *letter, *next;
 	unsigned int alloc = 0, done = 0, len;
@@ -165,5 +175,7 @@ encode_vowel_modifier:
 	if (devanagari)
 		devanagari[done] = '\0';
 
-	return devanagari;
+	*ret = devanagari;
+
+	return 0;
 }
diff --git a/transliteration.h b/transliteration.h
index cbdd17d..aea9cbb 100644
--- a/transliteration.h
+++ b/transliteration.h
@@ -22,8 +22,8 @@ struct translit_context {
 	struct translit_letter *table;
 };
 
-char *transliterate_devanagari_to_latin(const char *text);
-char *transliterate_latin_to_devanagari(const char *text);
+int transliterate_devanagari_to_latin(const char *text, char **ret);
+int transliterate_latin_to_devanagari(const char *text, char **ret);
 
 static inline int is_devanagari(unsigned int code)
 {
diff --git a/velthuis.c b/velthuis.c
index 3e12037..bb727db 100644
--- a/velthuis.c
+++ b/velthuis.c
@@ -55,7 +55,7 @@ static const struct encoder_tuple *find_tuple(const char *text)
 	return NULL;
 }
 
-char *encode_velthuis_to_iast_punctation(const char *text)
+int encode_velthuis_to_iast_punctation(const char *text, char **out)
 {
 	const char *str = text, *end = str + strlen(str);
 	const struct encoder_tuple *tuple;
@@ -79,5 +79,7 @@ char *encode_velthuis_to_iast_punctation(const char *text)
 		}
 	}
 
-	return buf;
+	*out = buf;
+
+	return 0;
 }
diff --git a/velthuis.h b/velthuis.h
index f10c0cd..314b99e 100644
--- a/velthuis.h
+++ b/velthuis.h
@@ -3,6 +3,6 @@
 #ifndef __VELTHUIS_H
 #define __VELTHUIS_H
 
-char *encode_velthuis_to_iast_punctation(const char *text);
+int encode_velthuis_to_iast_punctation(const char *text, char **out);
 
 #endif /* __VELTHUIS_H */