refuse to transliterate Hindi-specific characters

This commit is contained in:
Vlasta Vesely 2021-03-12 18:51:09 +01:00
parent 8db12fd79e
commit 739c7ea462
11 changed files with 122 additions and 41 deletions

View file

@ -10,4 +10,8 @@
#include <getopt.h> #include <getopt.h>
#include <errno.h> #include <errno.h>
/*
 * Non-zero status codes returned by the conversion routines.
 * A return value of 0 means the conversion succeeded.
 */
enum err {
/* the input was rejected because it is Hindi text */
EHINDI = 1
};
#endif /* __COMPAT_H */ #endif /* __COMPAT_H */

52
main.c
View file

@ -68,18 +68,42 @@ static void error(const char *msg, ...)
va_end(params); va_end(params);
} }
static char *process_input(const char *input, unsigned int flags) static int process_input(const char *input, char **out, unsigned int flags)
{ {
if (flags & FLAG_REVERSE) if (flags & FLAG_REVERSE)
return transliterate_latin_to_devanagari(input); return transliterate_latin_to_devanagari(input, out);
if (flags & FLAG_VELTHUIS) if (flags & FLAG_VELTHUIS)
return encode_velthuis_to_iast_punctation(input); return encode_velthuis_to_iast_punctation(input, out);
if (flags & FLAG_CZECH) if (flags & FLAG_CZECH)
return transcript_devanagari_to_czech(input); return transcript_devanagari_to_czech(input, out);
return transliterate_devanagari_to_latin(input); return transliterate_devanagari_to_latin(input, out);
}
/*
 * Convert `input` according to `flags` and write the result to stdout.
 *
 * Returns 0 on success, otherwise the non-zero status reported by
 * process_input(); every failure is reported through error().
 */
static int process_string(const char *input, unsigned int flags)
{
	/*
	 * Start from NULL so the free() below stays well-defined even if a
	 * converter fails without storing anything through the out pointer.
	 */
	char *output = NULL;
	int ret;

	ret = process_input(input, &output, flags);

	switch (ret) {
	case 0:
		/*
		 * A converter may report success with a NULL buffer when it
		 * produced no text; passing NULL to "%s" is undefined.
		 */
		if (output)
			fprintf(stdout, "%s", output);
		break;
	case EHINDI:
		error("the input text is Hindi");
		break;
	default:
		error("unexpected error");
		break;
	}

	free(output);
	return ret;
}
#define CHUNKSIZE 1024 #define CHUNKSIZE 1024
@ -135,7 +159,7 @@ int main(int argc, const char **argv)
const char *files[argc]; const char *files[argc];
unsigned int nfiles = 0; unsigned int nfiles = 0;
unsigned int flags = 0; unsigned int flags = 0;
char *input, *output; char *input;
if (argc == 1) { if (argc == 1) {
print_usage(); print_usage();
@ -178,10 +202,11 @@ int main(int argc, const char **argv)
while (optind < argc) { while (optind < argc) {
const char *arg = argv[optind++]; const char *arg = argv[optind++];
output = process_input(arg, flags); retval = process_string(arg, flags);
fprintf(stdout, "%s\n", output); if (retval != 0)
free(output); return retval;
} putchar('\n');
}
for (i = 0; i < nfiles; i++) { for (i = 0; i < nfiles; i++) {
if (strcmp(files[i], "-") == 0) { if (strcmp(files[i], "-") == 0) {
@ -194,11 +219,10 @@ int main(int argc, const char **argv)
error("failed to read file '%s'.", files[i]); error("failed to read file '%s'.", files[i]);
return retval; return retval;
} }
retval = process_string(input, flags);
output = process_input(input, flags);
fprintf(stdout, "%s", output);
free(output);
free(input); free(input);
if (retval != 0)
return retval;
} }
return 0; return 0;

View file

@ -1,10 +1,15 @@
#include "test.h" #include "test.h"
#include "transcript.h" #include "transcript.h"
#include "../compat.h"
#include "../transcription.h" #include "../transcription.h"
/*
 * Transcribe `devanagari` and require both a zero status and an exact match
 * with the expected string `latin`.
 */
static void test_transcript(const char *devanagari, const char *latin)
{
	char *result;
	const int status = transcript_devanagari_to_czech(devanagari, &result);

	ck_assert_int_eq(0, status);
	ck_assert_str_eq(latin, result);
	free(result);
}
@ -27,7 +32,18 @@ START_TEST(test_transcript_devanagari_to_czech)
} }
END_TEST END_TEST
/* Hindi input must be reported as EHINDI instead of being transcribed. */
START_TEST(test_transcript_detect_hindi)
{
	char *out;
	const int status = transcript_devanagari_to_czech("लड़की", &out);

	ck_assert_int_eq(EHINDI, status);
}
END_TEST
/* Add every transcription test defined in this file to `test_case`. */
void register_transcript_tests(TCase *test_case)
{
	tcase_add_test(test_case, test_transcript_devanagari_to_czech);
	tcase_add_test(test_case, test_transcript_detect_hindi);
}

View file

@ -1,11 +1,16 @@
#include "test.h" #include "test.h"
#include "translit.h" #include "translit.h"
#include "../compat.h"
#include "../transliteration.h" #include "../transliteration.h"
/*
 * Transliterate `devanagari` to Latin and require both a zero status and an
 * exact match with the expected string `latin`.
 */
static void test_transliterate_devanagari_to_latin(const char *devanagari,
						   const char *latin)
{
	char *result;
	const int status = transliterate_devanagari_to_latin(devanagari,
							     &result);

	ck_assert_int_eq(0, status);
	ck_assert_str_eq(latin, result);
	free(result);
}
@ -13,7 +18,10 @@ static void test_transliterate_devanagari_to_latin(const char *devanagari,
/*
 * Transliterate `latin` to Devanagari and require both a zero status and an
 * exact match with the expected string `devanagari`.
 */
static void test_transliterate_latin_to_devanagari(const char *latin,
						   const char *devanagari)
{
	char *str = NULL;
	int ret;

	ret = transliterate_latin_to_devanagari(latin, &str);

	/* Check the status first so a failed call is not hidden behind a
	 * string comparison on a buffer that was never produced. */
	ck_assert_int_eq(0, ret);
	ck_assert_str_eq(devanagari, str);
	free(str);
}
@ -65,10 +73,19 @@ START_TEST(test_translit_candrabindu)
} }
END_TEST END_TEST
/* Hindi input must be reported as EHINDI instead of being transliterated. */
START_TEST(test_translit_detect_hindi)
{
	char *out = NULL;
	int status;

	status = transliterate_devanagari_to_latin("लड़की", &out);
	ck_assert_int_eq(EHINDI, status);
}
END_TEST
/* Add every transliteration test defined in this file to `test_case`. */
void register_translit_tests(TCase *test_case)
{
	tcase_add_test(test_case, test_translit_devanagari_to_latin);
	tcase_add_test(test_case, test_translit_latin_to_devanagari);
	tcase_add_test(test_case, test_translit_lla_sylable);
	tcase_add_test(test_case, test_translit_candrabindu);
	tcase_add_test(test_case, test_translit_detect_hindi);
}

View file

@ -4,7 +4,8 @@
/*
 * Encode the Velthuis string `in` and require both a zero status and an
 * exact match with the expected string `expected`.
 */
static void test_encoding(const char *in, const char *expected)
{
	char *iast = NULL;
	int ret;

	ret = encode_velthuis_to_iast_punctation(in, &iast);

	/* The encoder reports its status through the return value; verify it
	 * before trusting the output buffer. */
	ck_assert_int_eq(0, ret);
	ck_assert_str_eq(expected, iast);
	free(iast);
}

View file

@ -6,8 +6,10 @@
#include "iast-czech.h" #include "iast-czech.h"
#include "utf8.h" #include "utf8.h"
#define CHUNKSIZE 1024
#define SCHWA_CHARACTER 'a' #define SCHWA_CHARACTER 'a'
#define VIRAMA 0x094d
#define NUKTA 0x093c
#define CHUNKSIZE 1024
static inline int is_consonant(unsigned int c) static inline int is_consonant(unsigned int c)
{ {
@ -58,7 +60,7 @@ static void end_of_word_filter(char *latin, unsigned int *pos,
/* remove singular nominative suffix */ /* remove singular nominative suffix */
len = utf8_char_length(c); len = utf8_char_length(c);
if (prev == 0x094d && *(latin + *pos - 1 - len) == 'm') { if (prev == VIRAMA && *(latin + *pos - 1 - len) == 'm') {
memmove(latin + *pos - 1 - len, latin + *pos - len, c); memmove(latin + *pos - 1 - len, latin + *pos - len, c);
*pos = *pos - 1; *pos = *pos - 1;
} }
@ -77,7 +79,7 @@ static struct translit_letter *letter_by_code(struct translit_letter *table,
return NULL; return NULL;
} }
char *transcript_devanagari_to_czech(const char *devanagari) int transcript_devanagari_to_czech(const char *devanagari, char **ret)
{ {
struct translit_letter *table, *letter; struct translit_letter *table, *letter;
unsigned int c, prev = 0, alloc = 0, done = 0, len; unsigned int c, prev = 0, alloc = 0, done = 0, len;
@ -98,6 +100,11 @@ char *transcript_devanagari_to_czech(const char *devanagari)
nasal_consonants_filter(latin, &done, prev, c); nasal_consonants_filter(latin, &done, prev, c);
if (c == NUKTA) {
*ret = NULL;
return EHINDI;
}
letter = letter_by_code(table, c); letter = letter_by_code(table, c);
if (letter) { if (letter) {
switch (letter->type) { switch (letter->type) {
@ -107,11 +114,10 @@ char *transcript_devanagari_to_czech(const char *devanagari)
*(latin + done++) = SCHWA_CHARACTER; *(latin + done++) = SCHWA_CHARACTER;
break; break;
case VOWEL_SIGN: case VOWEL_SIGN:
if (done) if (done) {
/* delete the inherent schwa */
done--; done--;
strcpy(latin + done, letter->data); }
done += strlen(letter->data);
break;
default: default:
strcpy(latin + done, letter->data); strcpy(latin + done, letter->data);
done += strlen(letter->data); done += strlen(letter->data);
@ -132,5 +138,7 @@ char *transcript_devanagari_to_czech(const char *devanagari)
*(latin + done - 1) = '\0'; *(latin + done - 1) = '\0';
return latin; *ret = latin;
return 0;
} }

View file

@ -3,6 +3,6 @@
#ifndef __TRANSCRIPTION_H #ifndef __TRANSCRIPTION_H
#define __TRANSCRIPTION_H #define __TRANSCRIPTION_H
char *transcript_devanagari_to_czech(const char *devanagari); int transcript_devanagari_to_czech(const char *devanagari, char **ret);
#endif /* __TRANSCRIPTION_H */ #endif /* __TRANSCRIPTION_H */

View file

@ -6,7 +6,8 @@
#include "utf8.h" #include "utf8.h"
#define SCHWA_CHARACTER 'a' #define SCHWA_CHARACTER 'a'
#define VIRAMA 0x094d #define VIRAMA 0x094d
#define NUKTA 0x093c
#define CHUNKSIZE 1024 #define CHUNKSIZE 1024
static struct translit_letter *letter_by_code(struct translit_letter *table, static struct translit_letter *letter_by_code(struct translit_letter *table,
@ -21,7 +22,7 @@ static struct translit_letter *letter_by_code(struct translit_letter *table,
return NULL; return NULL;
} }
char *transliterate_devanagari_to_latin(const char *devanagari) int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
{ {
struct translit_letter *table, *letter; struct translit_letter *table, *letter;
unsigned int c, alloc = 0, done = 0, len; unsigned int c, alloc = 0, done = 0, len;
@ -40,6 +41,11 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
len = utf8_char_length(c); len = utf8_char_length(c);
src += len; src += len;
if (c == NUKTA) {
*ret = NULL;
return EHINDI;
}
letter = letter_by_code(table, c); letter = letter_by_code(table, c);
if (letter) { if (letter) {
switch (letter->type) { switch (letter->type) {
@ -49,11 +55,10 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
*(latin + done++) = SCHWA_CHARACTER; *(latin + done++) = SCHWA_CHARACTER;
break; break;
case VOWEL_SIGN: case VOWEL_SIGN:
if (done) if (done) {
/* delete the inherent schwa */
done--; done--;
strcpy(latin + done, letter->data); }
done += strlen(letter->data);
break;
default: default:
strcpy(latin + done, letter->data); strcpy(latin + done, letter->data);
done += strlen(letter->data); done += strlen(letter->data);
@ -68,7 +73,9 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
break; break;
} }
return latin; *ret = latin;
return 0;
} }
static struct translit_letter *letter_by_data(struct translit_letter *table, static struct translit_letter *letter_by_data(struct translit_letter *table,
@ -98,7 +105,7 @@ static struct translit_letter *vowel_sign_by_data(struct translit_letter *table,
return NULL; return NULL;
} }
char *transliterate_latin_to_devanagari(const char *latin) int transliterate_latin_to_devanagari(const char *latin, char **ret)
{ {
struct translit_letter *table, *letter, *next; struct translit_letter *table, *letter, *next;
unsigned int alloc = 0, done = 0, len; unsigned int alloc = 0, done = 0, len;
@ -165,5 +172,7 @@ encode_vowel_modifier:
if (devanagari) if (devanagari)
devanagari[done] = '\0'; devanagari[done] = '\0';
return devanagari; *ret = devanagari;
return 0;
} }

View file

@ -22,8 +22,8 @@ struct translit_context {
struct translit_letter *table; struct translit_letter *table;
}; };
char *transliterate_devanagari_to_latin(const char *text); int transliterate_devanagari_to_latin(const char *text, char **ret);
char *transliterate_latin_to_devanagari(const char *text); int transliterate_latin_to_devanagari(const char *text, char **ret);
static inline int is_devanagari(unsigned int code) static inline int is_devanagari(unsigned int code)
{ {

View file

@ -55,7 +55,7 @@ static const struct encoder_tuple *find_tuple(const char *text)
return NULL; return NULL;
} }
char *encode_velthuis_to_iast_punctation(const char *text) int encode_velthuis_to_iast_punctation(const char *text, char **out)
{ {
const char *str = text, *end = str + strlen(str); const char *str = text, *end = str + strlen(str);
const struct encoder_tuple *tuple; const struct encoder_tuple *tuple;
@ -79,5 +79,7 @@ char *encode_velthuis_to_iast_punctation(const char *text)
} }
} }
return buf; *out = buf;
return 0;
} }

View file

@ -3,6 +3,6 @@
#ifndef __VELTHUIS_H #ifndef __VELTHUIS_H
#define __VELTHUIS_H #define __VELTHUIS_H
char *encode_velthuis_to_iast_punctation(const char *text); int encode_velthuis_to_iast_punctation(const char *text, char **out);
#endif /* __VELTHUIS_H */ #endif /* __VELTHUIS_H */