refuse to transliterate Hindi-specific characters
This commit is contained in:
parent
8db12fd79e
commit
739c7ea462
11 changed files with 122 additions and 41 deletions
4
compat.h
4
compat.h
|
@ -10,4 +10,8 @@
|
||||||
#include <getopt.h>
|
#include <getopt.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
|
||||||
|
enum err {
|
||||||
|
EHINDI = 1
|
||||||
|
};
|
||||||
|
|
||||||
#endif /* __COMPAT_H */
|
#endif /* __COMPAT_H */
|
||||||
|
|
50
main.c
50
main.c
|
@ -68,18 +68,42 @@ static void error(const char *msg, ...)
|
||||||
va_end(params);
|
va_end(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
static char *process_input(const char *input, unsigned int flags)
|
static int process_input(const char *input, char **out, unsigned int flags)
|
||||||
{
|
{
|
||||||
if (flags & FLAG_REVERSE)
|
if (flags & FLAG_REVERSE)
|
||||||
return transliterate_latin_to_devanagari(input);
|
return transliterate_latin_to_devanagari(input, out);
|
||||||
|
|
||||||
if (flags & FLAG_VELTHUIS)
|
if (flags & FLAG_VELTHUIS)
|
||||||
return encode_velthuis_to_iast_punctation(input);
|
return encode_velthuis_to_iast_punctation(input, out);
|
||||||
|
|
||||||
if (flags & FLAG_CZECH)
|
if (flags & FLAG_CZECH)
|
||||||
return transcript_devanagari_to_czech(input);
|
return transcript_devanagari_to_czech(input, out);
|
||||||
|
|
||||||
return transliterate_devanagari_to_latin(input);
|
return transliterate_devanagari_to_latin(input, out);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int process_string(const char *input, unsigned int flags)
|
||||||
|
{
|
||||||
|
char *output;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ret = process_input(input, &output, flags);
|
||||||
|
|
||||||
|
switch (ret) {
|
||||||
|
case 0:
|
||||||
|
fprintf(stdout, "%s", output);
|
||||||
|
break;
|
||||||
|
case EHINDI:
|
||||||
|
error("the input text is Hindi");
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
error("unexpected error");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
free(output);
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define CHUNKSIZE 1024
|
#define CHUNKSIZE 1024
|
||||||
|
@ -135,7 +159,7 @@ int main(int argc, const char **argv)
|
||||||
const char *files[argc];
|
const char *files[argc];
|
||||||
unsigned int nfiles = 0;
|
unsigned int nfiles = 0;
|
||||||
unsigned int flags = 0;
|
unsigned int flags = 0;
|
||||||
char *input, *output;
|
char *input;
|
||||||
|
|
||||||
if (argc == 1) {
|
if (argc == 1) {
|
||||||
print_usage();
|
print_usage();
|
||||||
|
@ -178,9 +202,10 @@ int main(int argc, const char **argv)
|
||||||
while (optind < argc) {
|
while (optind < argc) {
|
||||||
const char *arg = argv[optind++];
|
const char *arg = argv[optind++];
|
||||||
|
|
||||||
output = process_input(arg, flags);
|
retval = process_string(arg, flags);
|
||||||
fprintf(stdout, "%s\n", output);
|
if (retval != 0)
|
||||||
free(output);
|
return retval;
|
||||||
|
putchar('\n');
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < nfiles; i++) {
|
for (i = 0; i < nfiles; i++) {
|
||||||
|
@ -194,11 +219,10 @@ int main(int argc, const char **argv)
|
||||||
error("failed to read file '%s'.", files[i]);
|
error("failed to read file '%s'.", files[i]);
|
||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
retval = process_string(input, flags);
|
||||||
output = process_input(input, flags);
|
|
||||||
fprintf(stdout, "%s", output);
|
|
||||||
free(output);
|
|
||||||
free(input);
|
free(input);
|
||||||
|
if (retval != 0)
|
||||||
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -1,10 +1,15 @@
|
||||||
#include "test.h"
|
#include "test.h"
|
||||||
#include "transcript.h"
|
#include "transcript.h"
|
||||||
|
#include "../compat.h"
|
||||||
#include "../transcription.h"
|
#include "../transcription.h"
|
||||||
|
|
||||||
static void test_transcript(const char *devanagari, const char *latin)
|
static void test_transcript(const char *devanagari, const char *latin)
|
||||||
{
|
{
|
||||||
char *czech = transcript_devanagari_to_czech(devanagari);
|
char *czech;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ret = transcript_devanagari_to_czech(devanagari, &czech);
|
||||||
|
ck_assert_int_eq(0, ret);
|
||||||
ck_assert_str_eq(latin, czech);
|
ck_assert_str_eq(latin, czech);
|
||||||
free(czech);
|
free(czech);
|
||||||
}
|
}
|
||||||
|
@ -27,7 +32,18 @@ START_TEST(test_transcript_devanagari_to_czech)
|
||||||
}
|
}
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
|
START_TEST(test_transcript_detect_hindi)
|
||||||
|
{
|
||||||
|
char *czech;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ret = transcript_devanagari_to_czech("लड़की", &czech);
|
||||||
|
ck_assert_int_eq(EHINDI, ret);
|
||||||
|
}
|
||||||
|
END_TEST
|
||||||
|
|
||||||
void register_transcript_tests(TCase *test_case)
|
void register_transcript_tests(TCase *test_case)
|
||||||
{
|
{
|
||||||
tcase_add_test(test_case, test_transcript_devanagari_to_czech);
|
tcase_add_test(test_case, test_transcript_devanagari_to_czech);
|
||||||
|
tcase_add_test(test_case, test_transcript_detect_hindi);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,11 +1,16 @@
|
||||||
#include "test.h"
|
#include "test.h"
|
||||||
#include "translit.h"
|
#include "translit.h"
|
||||||
|
#include "../compat.h"
|
||||||
#include "../transliteration.h"
|
#include "../transliteration.h"
|
||||||
|
|
||||||
static void test_transliterate_devanagari_to_latin(const char *devanagari,
|
static void test_transliterate_devanagari_to_latin(const char *devanagari,
|
||||||
const char *latin)
|
const char *latin)
|
||||||
{
|
{
|
||||||
char *str = transliterate_devanagari_to_latin(devanagari);
|
char *str;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ret = transliterate_devanagari_to_latin(devanagari, &str);
|
||||||
|
ck_assert_int_eq(0, ret);
|
||||||
ck_assert_str_eq(latin, str);
|
ck_assert_str_eq(latin, str);
|
||||||
free(str);
|
free(str);
|
||||||
}
|
}
|
||||||
|
@ -13,7 +18,10 @@ static void test_transliterate_devanagari_to_latin(const char *devanagari,
|
||||||
static void test_transliterate_latin_to_devanagari(const char *latin,
|
static void test_transliterate_latin_to_devanagari(const char *latin,
|
||||||
const char *devanagari)
|
const char *devanagari)
|
||||||
{
|
{
|
||||||
char *str = transliterate_latin_to_devanagari(latin);
|
char *str;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ret = transliterate_latin_to_devanagari(latin, &str);
|
||||||
ck_assert_str_eq(devanagari, str);
|
ck_assert_str_eq(devanagari, str);
|
||||||
free(str);
|
free(str);
|
||||||
}
|
}
|
||||||
|
@ -65,10 +73,19 @@ START_TEST(test_translit_candrabindu)
|
||||||
}
|
}
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
|
START_TEST(test_translit_detect_hindi)
|
||||||
|
{
|
||||||
|
char *hindi = NULL;
|
||||||
|
int ret = transliterate_devanagari_to_latin("लड़की", &hindi);
|
||||||
|
ck_assert_int_eq(EHINDI, ret);
|
||||||
|
}
|
||||||
|
END_TEST
|
||||||
|
|
||||||
void register_translit_tests(TCase *test_case)
|
void register_translit_tests(TCase *test_case)
|
||||||
{
|
{
|
||||||
tcase_add_test(test_case, test_translit_devanagari_to_latin);
|
tcase_add_test(test_case, test_translit_devanagari_to_latin);
|
||||||
tcase_add_test(test_case, test_translit_latin_to_devanagari);
|
tcase_add_test(test_case, test_translit_latin_to_devanagari);
|
||||||
tcase_add_test(test_case, test_translit_lla_sylable);
|
tcase_add_test(test_case, test_translit_lla_sylable);
|
||||||
tcase_add_test(test_case, test_translit_candrabindu);
|
tcase_add_test(test_case, test_translit_candrabindu);
|
||||||
|
tcase_add_test(test_case, test_translit_detect_hindi);
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,8 @@
|
||||||
|
|
||||||
static void test_encoding(const char *in, const char *expected)
|
static void test_encoding(const char *in, const char *expected)
|
||||||
{
|
{
|
||||||
char *iast = encode_velthuis_to_iast_punctation(in);
|
char *iast;
|
||||||
|
encode_velthuis_to_iast_punctation(in, &iast);
|
||||||
ck_assert_str_eq(expected, iast);
|
ck_assert_str_eq(expected, iast);
|
||||||
free(iast);
|
free(iast);
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,8 +6,10 @@
|
||||||
#include "iast-czech.h"
|
#include "iast-czech.h"
|
||||||
#include "utf8.h"
|
#include "utf8.h"
|
||||||
|
|
||||||
#define CHUNKSIZE 1024
|
|
||||||
#define SCHWA_CHARACTER 'a'
|
#define SCHWA_CHARACTER 'a'
|
||||||
|
#define VIRAMA 0x094d
|
||||||
|
#define NUKTA 0x093c
|
||||||
|
#define CHUNKSIZE 1024
|
||||||
|
|
||||||
static inline int is_consonant(unsigned int c)
|
static inline int is_consonant(unsigned int c)
|
||||||
{
|
{
|
||||||
|
@ -58,7 +60,7 @@ static void end_of_word_filter(char *latin, unsigned int *pos,
|
||||||
|
|
||||||
/* remove singular nominative suffix */
|
/* remove singular nominative suffix */
|
||||||
len = utf8_char_length(c);
|
len = utf8_char_length(c);
|
||||||
if (prev == 0x094d && *(latin + *pos - 1 - len) == 'm') {
|
if (prev == VIRAMA && *(latin + *pos - 1 - len) == 'm') {
|
||||||
memmove(latin + *pos - 1 - len, latin + *pos - len, c);
|
memmove(latin + *pos - 1 - len, latin + *pos - len, c);
|
||||||
*pos = *pos - 1;
|
*pos = *pos - 1;
|
||||||
}
|
}
|
||||||
|
@ -77,7 +79,7 @@ static struct translit_letter *letter_by_code(struct translit_letter *table,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *transcript_devanagari_to_czech(const char *devanagari)
|
int transcript_devanagari_to_czech(const char *devanagari, char **ret)
|
||||||
{
|
{
|
||||||
struct translit_letter *table, *letter;
|
struct translit_letter *table, *letter;
|
||||||
unsigned int c, prev = 0, alloc = 0, done = 0, len;
|
unsigned int c, prev = 0, alloc = 0, done = 0, len;
|
||||||
|
@ -98,6 +100,11 @@ char *transcript_devanagari_to_czech(const char *devanagari)
|
||||||
|
|
||||||
nasal_consonants_filter(latin, &done, prev, c);
|
nasal_consonants_filter(latin, &done, prev, c);
|
||||||
|
|
||||||
|
if (c == NUKTA) {
|
||||||
|
*ret = NULL;
|
||||||
|
return EHINDI;
|
||||||
|
}
|
||||||
|
|
||||||
letter = letter_by_code(table, c);
|
letter = letter_by_code(table, c);
|
||||||
if (letter) {
|
if (letter) {
|
||||||
switch (letter->type) {
|
switch (letter->type) {
|
||||||
|
@ -107,11 +114,10 @@ char *transcript_devanagari_to_czech(const char *devanagari)
|
||||||
*(latin + done++) = SCHWA_CHARACTER;
|
*(latin + done++) = SCHWA_CHARACTER;
|
||||||
break;
|
break;
|
||||||
case VOWEL_SIGN:
|
case VOWEL_SIGN:
|
||||||
if (done)
|
if (done) {
|
||||||
|
/* delete the inherent schwa */
|
||||||
done--;
|
done--;
|
||||||
strcpy(latin + done, letter->data);
|
}
|
||||||
done += strlen(letter->data);
|
|
||||||
break;
|
|
||||||
default:
|
default:
|
||||||
strcpy(latin + done, letter->data);
|
strcpy(latin + done, letter->data);
|
||||||
done += strlen(letter->data);
|
done += strlen(letter->data);
|
||||||
|
@ -132,5 +138,7 @@ char *transcript_devanagari_to_czech(const char *devanagari)
|
||||||
|
|
||||||
*(latin + done - 1) = '\0';
|
*(latin + done - 1) = '\0';
|
||||||
|
|
||||||
return latin;
|
*ret = latin;
|
||||||
|
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,6 @@
|
||||||
#ifndef __TRANSCRIPTION_H
|
#ifndef __TRANSCRIPTION_H
|
||||||
#define __TRANSCRIPTION_H
|
#define __TRANSCRIPTION_H
|
||||||
|
|
||||||
char *transcript_devanagari_to_czech(const char *devanagari);
|
int transcript_devanagari_to_czech(const char *devanagari, char **ret);
|
||||||
|
|
||||||
#endif /* __TRANSCRIPTION_H */
|
#endif /* __TRANSCRIPTION_H */
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
|
|
||||||
#define SCHWA_CHARACTER 'a'
|
#define SCHWA_CHARACTER 'a'
|
||||||
#define VIRAMA 0x094d
|
#define VIRAMA 0x094d
|
||||||
|
#define NUKTA 0x093c
|
||||||
#define CHUNKSIZE 1024
|
#define CHUNKSIZE 1024
|
||||||
|
|
||||||
static struct translit_letter *letter_by_code(struct translit_letter *table,
|
static struct translit_letter *letter_by_code(struct translit_letter *table,
|
||||||
|
@ -21,7 +22,7 @@ static struct translit_letter *letter_by_code(struct translit_letter *table,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *transliterate_devanagari_to_latin(const char *devanagari)
|
int transliterate_devanagari_to_latin(const char *devanagari, char **ret)
|
||||||
{
|
{
|
||||||
struct translit_letter *table, *letter;
|
struct translit_letter *table, *letter;
|
||||||
unsigned int c, alloc = 0, done = 0, len;
|
unsigned int c, alloc = 0, done = 0, len;
|
||||||
|
@ -40,6 +41,11 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
|
||||||
len = utf8_char_length(c);
|
len = utf8_char_length(c);
|
||||||
src += len;
|
src += len;
|
||||||
|
|
||||||
|
if (c == NUKTA) {
|
||||||
|
*ret = NULL;
|
||||||
|
return EHINDI;
|
||||||
|
}
|
||||||
|
|
||||||
letter = letter_by_code(table, c);
|
letter = letter_by_code(table, c);
|
||||||
if (letter) {
|
if (letter) {
|
||||||
switch (letter->type) {
|
switch (letter->type) {
|
||||||
|
@ -49,11 +55,10 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
|
||||||
*(latin + done++) = SCHWA_CHARACTER;
|
*(latin + done++) = SCHWA_CHARACTER;
|
||||||
break;
|
break;
|
||||||
case VOWEL_SIGN:
|
case VOWEL_SIGN:
|
||||||
if (done)
|
if (done) {
|
||||||
|
/* delete the inherent schwa */
|
||||||
done--;
|
done--;
|
||||||
strcpy(latin + done, letter->data);
|
}
|
||||||
done += strlen(letter->data);
|
|
||||||
break;
|
|
||||||
default:
|
default:
|
||||||
strcpy(latin + done, letter->data);
|
strcpy(latin + done, letter->data);
|
||||||
done += strlen(letter->data);
|
done += strlen(letter->data);
|
||||||
|
@ -68,7 +73,9 @@ char *transliterate_devanagari_to_latin(const char *devanagari)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
return latin;
|
*ret = latin;
|
||||||
|
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct translit_letter *letter_by_data(struct translit_letter *table,
|
static struct translit_letter *letter_by_data(struct translit_letter *table,
|
||||||
|
@ -98,7 +105,7 @@ static struct translit_letter *vowel_sign_by_data(struct translit_letter *table,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *transliterate_latin_to_devanagari(const char *latin)
|
int transliterate_latin_to_devanagari(const char *latin, char **ret)
|
||||||
{
|
{
|
||||||
struct translit_letter *table, *letter, *next;
|
struct translit_letter *table, *letter, *next;
|
||||||
unsigned int alloc = 0, done = 0, len;
|
unsigned int alloc = 0, done = 0, len;
|
||||||
|
@ -165,5 +172,7 @@ encode_vowel_modifier:
|
||||||
if (devanagari)
|
if (devanagari)
|
||||||
devanagari[done] = '\0';
|
devanagari[done] = '\0';
|
||||||
|
|
||||||
return devanagari;
|
*ret = devanagari;
|
||||||
|
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,8 +22,8 @@ struct translit_context {
|
||||||
struct translit_letter *table;
|
struct translit_letter *table;
|
||||||
};
|
};
|
||||||
|
|
||||||
char *transliterate_devanagari_to_latin(const char *text);
|
int transliterate_devanagari_to_latin(const char *text, char **ret);
|
||||||
char *transliterate_latin_to_devanagari(const char *text);
|
int transliterate_latin_to_devanagari(const char *text, char **ret);
|
||||||
|
|
||||||
static inline int is_devanagari(unsigned int code)
|
static inline int is_devanagari(unsigned int code)
|
||||||
{
|
{
|
||||||
|
|
|
@ -55,7 +55,7 @@ static const struct encoder_tuple *find_tuple(const char *text)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *encode_velthuis_to_iast_punctation(const char *text)
|
int encode_velthuis_to_iast_punctation(const char *text, char **out)
|
||||||
{
|
{
|
||||||
const char *str = text, *end = str + strlen(str);
|
const char *str = text, *end = str + strlen(str);
|
||||||
const struct encoder_tuple *tuple;
|
const struct encoder_tuple *tuple;
|
||||||
|
@ -79,5 +79,7 @@ char *encode_velthuis_to_iast_punctation(const char *text)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return buf;
|
*out = buf;
|
||||||
|
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,6 @@
|
||||||
#ifndef __VELTHUIS_H
|
#ifndef __VELTHUIS_H
|
||||||
#define __VELTHUIS_H
|
#define __VELTHUIS_H
|
||||||
|
|
||||||
char *encode_velthuis_to_iast_punctation(const char *text);
|
int encode_velthuis_to_iast_punctation(const char *text, char **out);
|
||||||
|
|
||||||
#endif /* __VELTHUIS_H */
|
#endif /* __VELTHUIS_H */
|
||||||
|
|
Loading…
Reference in a new issue