refuse to transliterate Hindi-specific characters

This commit is contained in:
Vlasta Vesely 2021-03-12 18:51:09 +01:00
parent 8db12fd79e
commit 739c7ea462
11 changed files with 122 additions and 41 deletions

View file

@ -10,4 +10,8 @@
#include <getopt.h>
#include <errno.h>
/* Non-zero status codes returned by the conversion functions (0 is success). */
enum err {
/* returned when the input contains the Devanagari nukta sign */
EHINDI = 1
};
#endif /* __COMPAT_H */

50
main.c
View file

@ -68,18 +68,42 @@ static void error(const char *msg, ...)
va_end(params);
}
static char *process_input(const char *input, unsigned int flags)
static int process_input(const char *input, char **out, unsigned int flags)
{
if (flags & FLAG_REVERSE)
return transliterate_latin_to_devanagari(input);
return transliterate_latin_to_devanagari(input, out);
if (flags & FLAG_VELTHUIS)
return encode_velthuis_to_iast_punctation(input);
return encode_velthuis_to_iast_punctation(input, out);
if (flags & FLAG_CZECH)
return transcript_devanagari_to_czech(input);
return transcript_devanagari_to_czech(input, out);
return transliterate_devanagari_to_latin(input);
return transliterate_devanagari_to_latin(input, out);
}
/*
 * Convert one input string according to `flags` and print the result to
 * stdout.
 *
 * Returns the status reported by process_input(): 0 on success, EHINDI when
 * the converter refused the text, or any other non-zero code it produced.
 * Failures are reported through error().
 */
static int process_string(const char *input, unsigned int flags)
{
	/*
	 * Start from NULL so that the free() below stays well defined even when
	 * process_input() fails without storing anything into the out pointer.
	 */
	char *output = NULL;
	int ret;

	ret = process_input(input, &output, flags);

	switch (ret) {
	case 0:
		/*
		 * A converter can report success with a NULL buffer
		 * (transliterate_latin_to_devanagari() guards its own buffer
		 * against NULL before returning 0); "%s" with NULL is undefined.
		 */
		if (output)
			fprintf(stdout, "%s", output);
		break;
	case EHINDI:
		error("the input text is Hindi");
		break;
	default:
		error("unexpected error");
		break;
	}

	free(output);
	return ret;
}
#define CHUNKSIZE 1024
@ -135,7 +159,7 @@ int main(int argc, const char **argv)
const char *files[argc];
unsigned int nfiles = 0;
unsigned int flags = 0;
char *input, *output;
char *input;
if (argc == 1) {
print_usage();
@ -178,9 +202,10 @@ int main(int argc, const char **argv)
while (optind < argc) {
const char *arg = argv[optind++];
output = process_input(arg, flags);
fprintf(stdout, "%s\n", output);
free(output);
retval = process_string(arg, flags);
if (retval != 0)
return retval;
putchar('\n');
}
for (i = 0; i < nfiles; i++) {
@ -194,11 +219,10 @@ int main(int argc, const char **argv)
error("failed to read file '%s'.", files[i]);
return retval;
}
output = process_input(input, flags);
fprintf(stdout, "%s", output);
free(output);
retval = process_string(input, flags);
free(input);
if (retval != 0)
return retval;
}
return 0;

View file

@ -1,10 +1,15 @@
#include "test.h"
#include "transcript.h"
#include "../compat.h"
#include "../transcription.h"
static void test_transcript(const char *devanagari, const char *latin)
{
char *czech = transcript_devanagari_to_czech(devanagari);
char *czech;
int ret;
ret = transcript_devanagari_to_czech(devanagari, &czech);
ck_assert_int_eq(0, ret);
ck_assert_str_eq(latin, czech);
free(czech);
}
@ -27,7 +32,18 @@ START_TEST(test_transcript_devanagari_to_czech)
}
END_TEST
/* The Czech transcription is expected to report EHINDI for this word. */
START_TEST(test_transcript_detect_hindi)
{
	char *transcribed;
	const int status = transcript_devanagari_to_czech("लड़की", &transcribed);

	ck_assert_int_eq(EHINDI, status);
}
END_TEST
/* Add every transcription test defined in this file to `test_case`. */
void register_transcript_tests(TCase *test_case)
{
tcase_add_test(test_case, test_transcript_devanagari_to_czech);
/* checks that the transcription returns EHINDI for a Hindi word */
tcase_add_test(test_case, test_transcript_detect_hindi);
}

View file

@ -1,11 +1,16 @@
#include "test.h"
#include "translit.h"
#include "../compat.h"
#include "../transliteration.h"
static void test_transliterate_devanagari_to_latin(const char *devanagari,
const char *latin)
{
char *str = transliterate_devanagari_to_latin(devanagari);
char *str;
int ret;
ret = transliterate_devanagari_to_latin(devanagari, &str);
ck_assert_int_eq(0, ret);
ck_assert_str_eq(latin, str);
free(str);
}
@ -13,7 +18,10 @@ static void test_transliterate_devanagari_to_latin(const char *devanagari,
static void test_transliterate_latin_to_devanagari(const char *latin,
const char *devanagari)
{
char *str = transliterate_latin_to_devanagari(latin);
char *str;
int ret;
ret = transliterate_latin_to_devanagari(latin, &str);
ck_assert_str_eq(devanagari, str);
free(str);
}
@ -65,10 +73,19 @@ START_TEST(test_translit_candrabindu)
}
END_TEST
/* The Devanagari-to-Latin transliteration is expected to report EHINDI here. */
START_TEST(test_translit_detect_hindi)
{
	char *latin = NULL;
	int status;

	status = transliterate_devanagari_to_latin("लड़की", &latin);
	ck_assert_int_eq(EHINDI, status);
}
END_TEST
/* Add every transliteration test defined in this file to `test_case`. */
void register_translit_tests(TCase *test_case)
{
tcase_add_test(test_case, test_translit_devanagari_to_latin);
tcase_add_test(test_case, test_translit_latin_to_devanagari);
tcase_add_test(test_case, test_translit_lla_sylable);
tcase_add_test(test_case, test_translit_candrabindu);
/* checks that the transliteration returns EHINDI for a Hindi word */
tcase_add_test(test_case, test_translit_detect_hindi);
}

View file

@ -4,7 +4,8 @@
static void test_encoding(const char *in, const char *expected)
{
char *iast = encode_velthuis_to_iast_punctation(in);
char *iast;
encode_velthuis_to_iast_punctation(in, &iast);
ck_assert_str_eq(expected, iast);
free(iast);
}

View file

@ -6,8 +6,10 @@
#include "iast-czech.h"
#include "utf8.h"
#define CHUNKSIZE 1024
#define SCHWA_CHARACTER 'a'
#define VIRAMA 0x094d
#define NUKTA 0x093c
#define CHUNKSIZE 1024
static inline int is_consonant(unsigned int c)
{
@ -58,7 +60,7 @@ static void end_of_word_filter(char *latin, unsigned int *pos,
/* remove singular nominative suffix */
len = utf8_char_length(c);
if (prev == 0x094d && *(latin + *pos - 1 - len) == 'm') {
if (prev == VIRAMA && *(latin + *pos - 1 - len) == 'm') {
memmove(latin + *pos - 1 - len, latin + *pos - len, c);
*pos = *pos - 1;
}
@ -77,7 +79,7 @@ static struct translit_letter *letter_by_code(struct translit_letter *table,
return NULL;
}
char *transcript_devanagari_to_czech(const char *devanagari)
int transcript_devanagari_to_czech(const char *devanagari, char **ret)
{
struct translit_letter *table, *letter;
unsigned int c, prev = 0, alloc = 0, done = 0, len;
@ -98,6 +100,11 @@ char *transcript_devanagari_to_czech(const char *devanagari)
nasal_consonants_filter(latin, &done, prev, c);
if (c == NUKTA) {
*ret = NULL;
return EHINDI;
}
letter = letter_by_code(table, c);
if (letter) {
switch (letter->type) {
@ -107,11 +114,10 @@ char *transcript_devanagari_to_czech(const char *devanagari)
*(latin + done++) = SCHWA_CHARACTER;
break;
case VOWEL_SIGN:
if (done)
if (done) {
/* delete the inherent schwa */
done--;
strcpy(latin + done, letter->data);
done += strlen(letter->data);
break;
}
default:
strcpy(latin + done, letter->data);
done += strlen(letter->data);
@ -132,5 +138,7 @@ char *transcript_devanagari_to_czech(const char *devanagari)
*(latin + done - 1) = '\0';
return latin;
*ret = latin;
return 0;
}

View file

@ -3,6 +3,6 @@
#ifndef __TRANSCRIPTION_H
#define __TRANSCRIPTION_H
char *transcript_devanagari_to_czech(const char *devanagari);
int transcript_devanagari_to_czech(const char *devanagari, char **ret);
#endif /* __TRANSCRIPTION_H */

View file

@ -7,6 +7,7 @@
#define SCHWA_CHARACTER 'a'
#define VIRAMA 0x094d
#define NUKTA 0x093c
#define CHUNKSIZE 1024
static struct translit_letter *letter_by_code(struct translit_letter *table,
@ -21,7 +22,7 @@ static struct translit_letter *letter_by_code(struct translit_letter *table,
return NULL;
}
char *transliterate_devanagari_to_latin(const char *devanagari)
int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
{
struct translit_letter *table, *letter;
unsigned int c, alloc = 0, done = 0, len;
@ -40,6 +41,11 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
len = utf8_char_length(c);
src += len;
if (c == NUKTA) {
*ret = NULL;
return EHINDI;
}
letter = letter_by_code(table, c);
if (letter) {
switch (letter->type) {
@ -49,11 +55,10 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
*(latin + done++) = SCHWA_CHARACTER;
break;
case VOWEL_SIGN:
if (done)
if (done) {
/* delete the inherent schwa */
done--;
strcpy(latin + done, letter->data);
done += strlen(letter->data);
break;
}
default:
strcpy(latin + done, letter->data);
done += strlen(letter->data);
@ -68,7 +73,9 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
break;
}
return latin;
*ret = latin;
return 0;
}
static struct translit_letter *letter_by_data(struct translit_letter *table,
@ -98,7 +105,7 @@ static struct translit_letter *vowel_sign_by_data(struct translit_letter *table,
return NULL;
}
char *transliterate_latin_to_devanagari(const char *latin)
int transliterate_latin_to_devanagari(const char *latin, char **ret)
{
struct translit_letter *table, *letter, *next;
unsigned int alloc = 0, done = 0, len;
@ -165,5 +172,7 @@ encode_vowel_modifier:
if (devanagari)
devanagari[done] = '\0';
return devanagari;
*ret = devanagari;
return 0;
}

View file

@ -22,8 +22,8 @@ struct translit_context {
struct translit_letter *table;
};
char *transliterate_devanagari_to_latin(const char *text);
char *transliterate_latin_to_devanagari(const char *text);
int transliterate_devanagari_to_latin(const char *text, char **ret);
int transliterate_latin_to_devanagari(const char *text, char **ret);
static inline int is_devanagari(unsigned int code)
{

View file

@ -55,7 +55,7 @@ static const struct encoder_tuple *find_tuple(const char *text)
return NULL;
}
char *encode_velthuis_to_iast_punctation(const char *text)
int encode_velthuis_to_iast_punctation(const char *text, char **out)
{
const char *str = text, *end = str + strlen(str);
const struct encoder_tuple *tuple;
@ -79,5 +79,7 @@ char *encode_velthuis_to_iast_punctation(const char *text)
}
}
return buf;
*out = buf;
return 0;
}

View file

@ -3,6 +3,6 @@
#ifndef __VELTHUIS_H
#define __VELTHUIS_H
char *encode_velthuis_to_iast_punctation(const char *text);
int encode_velthuis_to_iast_punctation(const char *text, char **out);
#endif /* __VELTHUIS_H */