refuse to transliterate Hindi-specific characters
This commit is contained in:
parent
8db12fd79e
commit
739c7ea462
11 changed files with 122 additions and 41 deletions
4
compat.h
4
compat.h
|
@ -10,4 +10,8 @@
|
|||
#include <getopt.h>
|
||||
#include <errno.h>
|
||||
|
||||
/*
 * Error codes returned by the conversion functions; 0 means success.
 * NOTE(review): this header includes <errno.h>, and the C standard reserves
 * names starting with 'E' plus an uppercase letter for future errno macros -
 * confirm EHINDI cannot clash on the supported platforms.
 */
enum err {
|
||||
EHINDI = 1 /* input contains a nukta, which is treated as Hindi-specific */
|
||||
};
|
||||
|
||||
#endif /* __COMPAT_H */
|
||||
|
|
52
main.c
52
main.c
|
@ -68,18 +68,42 @@ static void error(const char *msg, ...)
|
|||
va_end(params);
|
||||
}
|
||||
|
||||
static char *process_input(const char *input, unsigned int flags)
|
||||
static int process_input(const char *input, char **out, unsigned int flags)
|
||||
{
|
||||
if (flags & FLAG_REVERSE)
|
||||
return transliterate_latin_to_devanagari(input);
|
||||
return transliterate_latin_to_devanagari(input, out);
|
||||
|
||||
if (flags & FLAG_VELTHUIS)
|
||||
return encode_velthuis_to_iast_punctation(input);
|
||||
return encode_velthuis_to_iast_punctation(input, out);
|
||||
|
||||
if (flags & FLAG_CZECH)
|
||||
return transcript_devanagari_to_czech(input);
|
||||
return transcript_devanagari_to_czech(input, out);
|
||||
|
||||
return transliterate_devanagari_to_latin(input);
|
||||
return transliterate_devanagari_to_latin(input, out);
|
||||
}
|
||||
|
||||
static int process_string(const char *input, unsigned int flags)
|
||||
{
|
||||
char *output;
|
||||
int ret;
|
||||
|
||||
ret = process_input(input, &output, flags);
|
||||
|
||||
switch (ret) {
|
||||
case 0:
|
||||
fprintf(stdout, "%s", output);
|
||||
break;
|
||||
case EHINDI:
|
||||
error("the input text is Hindi");
|
||||
break;
|
||||
default:
|
||||
error("unexpected error");
|
||||
break;
|
||||
}
|
||||
|
||||
free(output);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define CHUNKSIZE 1024
|
||||
|
@ -135,7 +159,7 @@ int main(int argc, const char **argv)
|
|||
const char *files[argc];
|
||||
unsigned int nfiles = 0;
|
||||
unsigned int flags = 0;
|
||||
char *input, *output;
|
||||
char *input;
|
||||
|
||||
if (argc == 1) {
|
||||
print_usage();
|
||||
|
@ -178,10 +202,11 @@ int main(int argc, const char **argv)
|
|||
while (optind < argc) {
|
||||
const char *arg = argv[optind++];
|
||||
|
||||
output = process_input(arg, flags);
|
||||
fprintf(stdout, "%s\n", output);
|
||||
free(output);
|
||||
}
|
||||
retval = process_string(arg, flags);
|
||||
if (retval != 0)
|
||||
return retval;
|
||||
putchar('\n');
|
||||
}
|
||||
|
||||
for (i = 0; i < nfiles; i++) {
|
||||
if (strcmp(files[i], "-") == 0) {
|
||||
|
@ -194,11 +219,10 @@ int main(int argc, const char **argv)
|
|||
error("failed to read file '%s'.", files[i]);
|
||||
return retval;
|
||||
}
|
||||
|
||||
output = process_input(input, flags);
|
||||
fprintf(stdout, "%s", output);
|
||||
free(output);
|
||||
retval = process_string(input, flags);
|
||||
free(input);
|
||||
if (retval != 0)
|
||||
return retval;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -1,10 +1,15 @@
|
|||
#include "test.h"
|
||||
#include "transcript.h"
|
||||
#include "../compat.h"
|
||||
#include "../transcription.h"
|
||||
|
||||
static void test_transcript(const char *devanagari, const char *latin)
|
||||
{
|
||||
char *czech = transcript_devanagari_to_czech(devanagari);
|
||||
char *czech;
|
||||
int ret;
|
||||
|
||||
ret = transcript_devanagari_to_czech(devanagari, &czech);
|
||||
ck_assert_int_eq(0, ret);
|
||||
ck_assert_str_eq(latin, czech);
|
||||
free(czech);
|
||||
}
|
||||
|
@ -27,7 +32,18 @@ START_TEST(test_transcript_devanagari_to_czech)
|
|||
}
|
||||
END_TEST
|
||||
|
||||
START_TEST(test_transcript_detect_hindi)
|
||||
{
|
||||
char *czech;
|
||||
int ret;
|
||||
|
||||
ret = transcript_devanagari_to_czech("लड़की", &czech);
|
||||
ck_assert_int_eq(EHINDI, ret);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/* Add every transcription test defined in this file to test_case. */
void register_transcript_tests(TCase *test_case)
|
||||
{
|
||||
tcase_add_test(test_case, test_transcript_devanagari_to_czech);
|
||||
tcase_add_test(test_case, test_transcript_detect_hindi);
|
||||
}
|
||||
|
|
|
@ -1,11 +1,16 @@
|
|||
#include "test.h"
|
||||
#include "translit.h"
|
||||
#include "../compat.h"
|
||||
#include "../transliteration.h"
|
||||
|
||||
static void test_transliterate_devanagari_to_latin(const char *devanagari,
|
||||
const char *latin)
|
||||
{
|
||||
char *str = transliterate_devanagari_to_latin(devanagari);
|
||||
char *str;
|
||||
int ret;
|
||||
|
||||
ret = transliterate_devanagari_to_latin(devanagari, &str);
|
||||
ck_assert_int_eq(0, ret);
|
||||
ck_assert_str_eq(latin, str);
|
||||
free(str);
|
||||
}
|
||||
|
@ -13,7 +18,10 @@ static void test_transliterate_devanagari_to_latin(const char *devanagari,
|
|||
static void test_transliterate_latin_to_devanagari(const char *latin,
|
||||
const char *devanagari)
|
||||
{
|
||||
char *str = transliterate_latin_to_devanagari(latin);
|
||||
char *str;
|
||||
int ret;
|
||||
|
||||
ret = transliterate_latin_to_devanagari(latin, &str);
|
||||
ck_assert_str_eq(devanagari, str);
|
||||
free(str);
|
||||
}
|
||||
|
@ -65,10 +73,19 @@ START_TEST(test_translit_candrabindu)
|
|||
}
|
||||
END_TEST
|
||||
|
||||
START_TEST(test_translit_detect_hindi)
|
||||
{
|
||||
char *hindi = NULL;
|
||||
int ret = transliterate_devanagari_to_latin("लड़की", &hindi);
|
||||
ck_assert_int_eq(EHINDI, ret);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/* Add every transliteration test defined in this file to test_case. */
void register_translit_tests(TCase *test_case)
|
||||
{
|
||||
tcase_add_test(test_case, test_translit_devanagari_to_latin);
|
||||
tcase_add_test(test_case, test_translit_latin_to_devanagari);
|
||||
tcase_add_test(test_case, test_translit_lla_sylable);
|
||||
tcase_add_test(test_case, test_translit_candrabindu);
|
||||
tcase_add_test(test_case, test_translit_detect_hindi);
|
||||
}
|
||||
|
|
|
@ -4,7 +4,8 @@
|
|||
|
||||
static void test_encoding(const char *in, const char *expected)
|
||||
{
|
||||
char *iast = encode_velthuis_to_iast_punctation(in);
|
||||
char *iast;
|
||||
encode_velthuis_to_iast_punctation(in, &iast);
|
||||
ck_assert_str_eq(expected, iast);
|
||||
free(iast);
|
||||
}
|
||||
|
|
|
@ -6,8 +6,10 @@
|
|||
#include "iast-czech.h"
|
||||
#include "utf8.h"
|
||||
|
||||
#define CHUNKSIZE 1024
|
||||
#define SCHWA_CHARACTER 'a'
|
||||
#define VIRAMA 0x094d
|
||||
#define NUKTA 0x093c
|
||||
#define CHUNKSIZE 1024
|
||||
|
||||
static inline int is_consonant(unsigned int c)
|
||||
{
|
||||
|
@ -58,7 +60,7 @@ static void end_of_word_filter(char *latin, unsigned int *pos,
|
|||
|
||||
/* remove singular nominative suffix */
|
||||
len = utf8_char_length(c);
|
||||
if (prev == 0x094d && *(latin + *pos - 1 - len) == 'm') {
|
||||
if (prev == VIRAMA && *(latin + *pos - 1 - len) == 'm') {
|
||||
memmove(latin + *pos - 1 - len, latin + *pos - len, c);
|
||||
*pos = *pos - 1;
|
||||
}
|
||||
|
@ -77,7 +79,7 @@ static struct translit_letter *letter_by_code(struct translit_letter *table,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
char *transcript_devanagari_to_czech(const char *devanagari)
|
||||
int transcript_devanagari_to_czech(const char *devanagari, char **ret)
|
||||
{
|
||||
struct translit_letter *table, *letter;
|
||||
unsigned int c, prev = 0, alloc = 0, done = 0, len;
|
||||
|
@ -98,6 +100,11 @@ char *transcript_devanagari_to_czech(const char *devanagari)
|
|||
|
||||
nasal_consonants_filter(latin, &done, prev, c);
|
||||
|
||||
if (c == NUKTA) {
|
||||
*ret = NULL;
|
||||
return EHINDI;
|
||||
}
|
||||
|
||||
letter = letter_by_code(table, c);
|
||||
if (letter) {
|
||||
switch (letter->type) {
|
||||
|
@ -107,11 +114,10 @@ char *transcript_devanagari_to_czech(const char *devanagari)
|
|||
*(latin + done++) = SCHWA_CHARACTER;
|
||||
break;
|
||||
case VOWEL_SIGN:
|
||||
if (done)
|
||||
if (done) {
|
||||
/* delete the inherent schwa */
|
||||
done--;
|
||||
strcpy(latin + done, letter->data);
|
||||
done += strlen(letter->data);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
strcpy(latin + done, letter->data);
|
||||
done += strlen(letter->data);
|
||||
|
@ -132,5 +138,7 @@ char *transcript_devanagari_to_czech(const char *devanagari)
|
|||
|
||||
*(latin + done - 1) = '\0';
|
||||
|
||||
return latin;
|
||||
*ret = latin;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -3,6 +3,6 @@
|
|||
#ifndef __TRANSCRIPTION_H
|
||||
#define __TRANSCRIPTION_H
|
||||
|
||||
char *transcript_devanagari_to_czech(const char *devanagari);
|
||||
int transcript_devanagari_to_czech(const char *devanagari, char **ret);
|
||||
|
||||
#endif /* __TRANSCRIPTION_H */
|
||||
|
|
|
@ -6,7 +6,8 @@
|
|||
#include "utf8.h"
|
||||
|
||||
#define SCHWA_CHARACTER 'a'
|
||||
#define VIRAMA 0x094d
|
||||
#define VIRAMA 0x094d
|
||||
#define NUKTA 0x093c
|
||||
#define CHUNKSIZE 1024
|
||||
|
||||
static struct translit_letter *letter_by_code(struct translit_letter *table,
|
||||
|
@ -21,7 +22,7 @@ static struct translit_letter *letter_by_code(struct translit_letter *table,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
char *transliterate_devanagari_to_latin(const char *devanagari)
|
||||
int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
|
||||
{
|
||||
struct translit_letter *table, *letter;
|
||||
unsigned int c, alloc = 0, done = 0, len;
|
||||
|
@ -40,6 +41,11 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
|
|||
len = utf8_char_length(c);
|
||||
src += len;
|
||||
|
||||
if (c == NUKTA) {
|
||||
*ret = NULL;
|
||||
return EHINDI;
|
||||
}
|
||||
|
||||
letter = letter_by_code(table, c);
|
||||
if (letter) {
|
||||
switch (letter->type) {
|
||||
|
@ -49,11 +55,10 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
|
|||
*(latin + done++) = SCHWA_CHARACTER;
|
||||
break;
|
||||
case VOWEL_SIGN:
|
||||
if (done)
|
||||
if (done) {
|
||||
/* delete the inherent schwa */
|
||||
done--;
|
||||
strcpy(latin + done, letter->data);
|
||||
done += strlen(letter->data);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
strcpy(latin + done, letter->data);
|
||||
done += strlen(letter->data);
|
||||
|
@ -68,7 +73,9 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
|
|||
break;
|
||||
}
|
||||
|
||||
return latin;
|
||||
*ret = latin;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct translit_letter *letter_by_data(struct translit_letter *table,
|
||||
|
@ -98,7 +105,7 @@ static struct translit_letter *vowel_sign_by_data(struct translit_letter *table,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
char *transliterate_latin_to_devanagari(const char *latin)
|
||||
int transliterate_latin_to_devanagari(const char *latin, char **ret)
|
||||
{
|
||||
struct translit_letter *table, *letter, *next;
|
||||
unsigned int alloc = 0, done = 0, len;
|
||||
|
@ -165,5 +172,7 @@ encode_vowel_modifier:
|
|||
if (devanagari)
|
||||
devanagari[done] = '\0';
|
||||
|
||||
return devanagari;
|
||||
*ret = devanagari;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -22,8 +22,8 @@ struct translit_context {
|
|||
struct translit_letter *table;
|
||||
};
|
||||
|
||||
char *transliterate_devanagari_to_latin(const char *text);
|
||||
char *transliterate_latin_to_devanagari(const char *text);
|
||||
int transliterate_devanagari_to_latin(const char *text, char **ret);
|
||||
int transliterate_latin_to_devanagari(const char *text, char **ret);
|
||||
|
||||
static inline int is_devanagari(unsigned int code)
|
||||
{
|
||||
|
|
|
@ -55,7 +55,7 @@ static const struct encoder_tuple *find_tuple(const char *text)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
char *encode_velthuis_to_iast_punctation(const char *text)
|
||||
int encode_velthuis_to_iast_punctation(const char *text, char **out)
|
||||
{
|
||||
const char *str = text, *end = str + strlen(str);
|
||||
const struct encoder_tuple *tuple;
|
||||
|
@ -79,5 +79,7 @@ char *encode_velthuis_to_iast_punctation(const char *text)
|
|||
}
|
||||
}
|
||||
|
||||
return buf;
|
||||
*out = buf;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -3,6 +3,6 @@
|
|||
#ifndef __VELTHUIS_H
|
||||
#define __VELTHUIS_H
|
||||
|
||||
char *encode_velthuis_to_iast_punctation(const char *text);
|
||||
int encode_velthuis_to_iast_punctation(const char *text, char **out);
|
||||
|
||||
#endif /* __VELTHUIS_H */
|
||||
|
|
Loading…
Reference in a new issue