Reimplement Czech transcription of Sanskrit
This commit is contained in:
parent
412be221a7
commit
9c3db8ebb8
8 changed files with 300 additions and 2 deletions
4
Makefile
4
Makefile
|
@ -2,8 +2,8 @@ PREFIX=/usr
|
|||
|
||||
.PHONY: main test install uninstall clean
|
||||
|
||||
OBJECTS = iast.o transliteration.o utf8.o encoder.o
|
||||
TEST_OBJECTS = tests/main.o tests/translit.o
|
||||
OBJECTS = iast.o iast-czech.o transliteration.o transcription.o utf8.o encoder.o
|
||||
TEST_OBJECTS = tests/main.o tests/translit.o tests/transcript.o
|
||||
CFLAGS = -Wall
|
||||
LIBS =
|
||||
TEST_CFLAGS = $(CFLAGS) $(shell pkg-config --cflags check)
|
||||
|
|
101
iast-czech.c
Normal file
101
iast-czech.c
Normal file
|
@ -0,0 +1,101 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
|
||||
#include "compat.h"
|
||||
#include "iast-czech.h"
|
||||
|
||||
static struct translit_letter table[] = {
|
||||
|
||||
/* Special characters */
|
||||
{0x0950, SPECIAL, "óm"}, /* aum */
|
||||
|
||||
/* Vowels */
|
||||
{0x0910, VOWEL, "ai"}, /* 01 */
|
||||
{0x0914, VOWEL, "au"}, /* 02 */
|
||||
{0x0905, VOWEL, "a"}, /* 03 */
|
||||
{0x0906, VOWEL, "á"}, /* 04 */
|
||||
{0x0907, VOWEL, "i"}, /* 05 */
|
||||
{0x0908, VOWEL, "í"}, /* 06 */
|
||||
{0x0909, VOWEL, "u"}, /* 07 */
|
||||
{0x090a, VOWEL, "ú"}, /* 08 */
|
||||
{0x090b, VOWEL, "r"}, /* 09 */
|
||||
{0x0960, VOWEL, "r"}, /* 10 */
|
||||
{0x090c, VOWEL, "l"}, /* 11 */
|
||||
{0x0961, VOWEL, "l"}, /* 12 */
|
||||
{0x090f, VOWEL, "é"}, /* 13 */
|
||||
{0x0913, VOWEL, "ó"}, /* 14 */
|
||||
|
||||
/* Consonants */
|
||||
{0x0916, CONSONANT, "kh"}, /* 01 */
|
||||
{0x0918, CONSONANT, "gh"}, /* 02 */
|
||||
{0x091b, CONSONANT, "ch"}, /* 03 */
|
||||
{0x091d, CONSONANT, "džh"}, /* 04 */
|
||||
{0x091c, CONSONANT, "dž"}, /* 05 */
|
||||
{0x0920, CONSONANT, "th"}, /* 06 */
|
||||
{0x0922, CONSONANT, "dh"}, /* 07 */
|
||||
{0x0925, CONSONANT, "th"}, /* 08 */
|
||||
{0x0927, CONSONANT, "dh"}, /* 09 */
|
||||
{0x092b, CONSONANT, "ph"}, /* 10 */
|
||||
{0x092d, CONSONANT, "bh"}, /* 11 */
|
||||
{0x0915, CONSONANT, "k"}, /* 12 */
|
||||
{0x0917, CONSONANT, "g"}, /* 13 */
|
||||
{0x0919, CONSONANT, "n"}, /* 14 */
|
||||
{0x0939, CONSONANT, "h"}, /* 15 */
|
||||
{0x091a, CONSONANT, "c"}, /* 16 */
|
||||
{0x091e, CONSONANT, "ň"}, /* 17 */
|
||||
{0x092f, CONSONANT, "j"}, /* 18 */
|
||||
{0x0936, CONSONANT, "š"}, /* 19 */
|
||||
{0x091F, CONSONANT, "t"}, /* 20 */
|
||||
{0x0921, CONSONANT, "d"}, /* 21 */
|
||||
{0x0923, CONSONANT, "n"}, /* 22 */
|
||||
{0x0930, CONSONANT, "r"}, /* 23 */
|
||||
{0x0937, CONSONANT, "š"}, /* 24 */
|
||||
{0x0924, CONSONANT, "t"}, /* 25 */
|
||||
{0x0926, CONSONANT, "d"}, /* 26 */
|
||||
{0x0928, CONSONANT, "n"}, /* 27 */
|
||||
{0x0932, CONSONANT, "l"}, /* 28 */
|
||||
{0x0938, CONSONANT, "s"}, /* 29 */
|
||||
{0x092a, CONSONANT, "p"}, /* 30 */
|
||||
{0x092c, CONSONANT, "b"}, /* 31 */
|
||||
{0x092e, CONSONANT, "m"}, /* 32 */
|
||||
{0x0935, CONSONANT, "v"}, /* 33 */
|
||||
|
||||
/* Codas */
|
||||
{0x0902, CODA, "m"}, /* anusvara */
|
||||
{0x0903, CODA, ""}, /* visarga */
|
||||
{0x093d, CODA, "'"}, /* avagrada */
|
||||
|
||||
/* Numbers */
|
||||
{0x0966, NUMBER, "0"},
|
||||
{0x0967, NUMBER, "1"},
|
||||
{0x0968, NUMBER, "2"},
|
||||
{0x0969, NUMBER, "3"},
|
||||
{0x096a, NUMBER, "4"},
|
||||
{0x096b, NUMBER, "5"},
|
||||
{0x096c, NUMBER, "6"},
|
||||
{0x096d, NUMBER, "7"},
|
||||
{0x096e, NUMBER, "8"},
|
||||
{0x096f, NUMBER, "9"},
|
||||
|
||||
/* Diacritic modifiers */
|
||||
{0x0948, VOWEL_SIGN, "ai"},
|
||||
{0x094c, VOWEL_SIGN, "au"},
|
||||
{0x093e, VOWEL_SIGN, "á"},
|
||||
{0x093f, VOWEL_SIGN, "i"},
|
||||
{0x0940, VOWEL_SIGN, "í"},
|
||||
{0x0941, VOWEL_SIGN, "u"},
|
||||
{0x0942, VOWEL_SIGN, "ú"},
|
||||
{0x0943, VOWEL_SIGN, "r"},
|
||||
{0x0944, VOWEL_SIGN, "r"},
|
||||
{0x0962, VOWEL_SIGN, "l"},
|
||||
{0x0963, VOWEL_SIGN, "l"},
|
||||
{0x0947, VOWEL_SIGN, "é"},
|
||||
{0x094b, VOWEL_SIGN, "ó"},
|
||||
{0x094d, VOWEL_SIGN, ""}, /* virama */
|
||||
|
||||
{0, 0, NULL}
|
||||
};
|
||||
|
||||
struct translit_letter *get_iast_czech_transliteration_table()
|
||||
{
|
||||
return table;
|
||||
}
|
10
iast-czech.h
Normal file
10
iast-czech.h
Normal file
|
@ -0,0 +1,10 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
|
||||
/*
 * Include guard: identifiers containing a double underscore are reserved
 * for the implementation, so the guard uses a plain name.
 */
#ifndef IAST_CZECH_H
#define IAST_CZECH_H

#include "transliteration.h"

/*
 * Return the Devanagari -> Czech transcription table.  The table is
 * terminated by an entry whose code is 0.
 */
struct translit_letter *get_iast_czech_transliteration_table(void);

#endif /* IAST_CZECH_H */
|
|
@ -1,5 +1,6 @@
|
|||
#include "test.h"
|
||||
#include "translit.h"
|
||||
#include "transcript.h"
|
||||
|
||||
static Suite *create_test_suite()
|
||||
{
|
||||
|
@ -10,6 +11,7 @@ static Suite *create_test_suite()
|
|||
test_case = tcase_create(NULL);
|
||||
|
||||
register_translit_tests(test_case);
|
||||
register_transcript_tests(test_case);
|
||||
suite_add_tcase(suite, test_case);
|
||||
|
||||
return suite;
|
||||
|
|
34
tests/transcript.c
Normal file
34
tests/transcript.c
Normal file
|
@ -0,0 +1,34 @@
|
|||
#include "test.h"
|
||||
#include "transcript.h"
|
||||
#include "../transcription.h"
|
||||
|
||||
START_TEST(test_transcript_devanagari_to_czech)
|
||||
{
|
||||
char *czech;
|
||||
|
||||
czech = transcript_devanagari_to_czech("तन्त्रशास्त्रम्");
|
||||
ck_assert_str_eq("tantrašástra", czech);
|
||||
free(czech);
|
||||
|
||||
czech = transcript_devanagari_to_czech("सांख्य");
|
||||
ck_assert_str_eq("sánkhja", czech);
|
||||
free(czech);
|
||||
|
||||
czech = transcript_devanagari_to_czech("महाभारतम्");
|
||||
ck_assert_str_eq("mahábhárata", czech);
|
||||
free(czech);
|
||||
|
||||
czech = transcript_devanagari_to_czech("योगः");
|
||||
ck_assert_str_eq("jóga", czech);
|
||||
free(czech);
|
||||
|
||||
czech = transcript_devanagari_to_czech("भगवद्गीता");
|
||||
ck_assert_str_eq("bhagavadgíta", czech);
|
||||
free(czech);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/*
 * Add the transcription tests defined in this file to test_case.
 */
void register_transcript_tests(TCase *test_case)
{
	tcase_add_test(test_case, test_transcript_devanagari_to_czech);
}
|
8
tests/transcript.h
Normal file
8
tests/transcript.h
Normal file
|
@ -0,0 +1,8 @@
|
|||
/*
 * Include guard: identifiers containing a double underscore are reserved
 * for the implementation, so the guard uses a plain name.
 */
#ifndef TEST_TRANSCRIPT_H
#define TEST_TRANSCRIPT_H

#include <check.h>

/* Add the Devanagari -> Czech transcription tests to test_case. */
void register_transcript_tests(TCase *test_case);

#endif /* TEST_TRANSCRIPT_H */
|
135
transcription.c
Normal file
135
transcription.c
Normal file
|
@ -0,0 +1,135 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
|
||||
#include "compat.h"
|
||||
#include "transcription.h"
|
||||
#include "iast-czech.h"
|
||||
#include "utf8.h"
|
||||
|
||||
#define CHUNKSIZE 1024
|
||||
/* Inherent vowel appended after every consonant until a vowel sign or virama replaces it. */
#define SCHWA_CHARACTER 'a'

/* Return non-zero when c lies in the range U+0915..U+0939. */
static inline int is_consonant(unsigned int c)
{
	return (c >= 0x0915 && c <= 0x0939);
}

/* Return non-zero for the five nasal consonants and for the anusvara (U+0902). */
static inline int is_nasal(unsigned int c)
{
	switch (c) {
	case 0x0919:
	case 0x091e:
	case 0x0923:
	case 0x0928:
	case 0x092e:
	case 0x0902:
		return 1;
	default:
		return 0;
	}
}

/*
 * Assimilate a nasal that directly precedes a consonant.
 *
 * latin - output buffer written so far
 * pos   - number of bytes already written to latin
 * prev  - code point processed in the previous step
 * c     - code point about to be written
 *
 * When prev is a nasal and the last output byte is not the inherent schwa,
 * that byte becomes 'm' before a labial (ph, bh, p, b, m) and 'n' before
 * any other consonant.  A nasal that still carries its schwa is separated
 * from c by a vowel, so it is left untouched; previously the labial branch
 * overwrote the schwa itself.  Nothing is done while the buffer is empty.
 */
static void nasal_consonants_filter(char *latin, unsigned int *pos,
				    unsigned int prev, unsigned int c)
{
	char *tail;

	if (*pos == 0 || !is_nasal(prev))
		return;

	tail = latin + *pos - 1;
	if (*tail == SCHWA_CHARACTER)
		return;

	switch (c) {
	case 0x092b: /* ph */
	case 0x092d: /* bh */
	case 0x092a: /* p */
	case 0x092c: /* b */
	case 0x092e: /* m */
		*tail = 'm';
		break;
	default:
		if (is_consonant(c))
			*tail = 'n';
		break;
	}
}
|
||||
|
||||
/*
 * Adjust the end of a word once the first non-Devanagari character after
 * it has been appended to the output.
 *
 * latin - output buffer
 * pos   - number of bytes written so far; decremented when a byte is removed
 * prev  - previous code point (last character of the word)
 * c     - current code point
 *
 * The filter expects the last utf8_char_length(c) bytes of the output to be
 * the encoding of c, which is how transcript_devanagari_to_czech() calls it.
 * Those bytes are shifted down whenever a byte is removed in front of them,
 * so the character following the word is kept intact.
 *
 * Two rewrites are applied:
 *  - a word-final vowel sign 'á' (U+093E) is shortened to 'a';
 *  - a word-final 'm' followed by virama (U+094D), the singular nominative
 *    suffix, is dropped.
 */
static void end_of_word_filter(char *latin, unsigned int *pos,
			       unsigned int prev, unsigned int c)
{
	unsigned int len;
	char *cur;	/* first byte of c inside the output */

	if (!is_devanagari(prev) || is_devanagari(c))
		return;

	len = utf8_char_length(c);

	if (prev == 0x093e) { /* modifier 'á' */
		/* "á" takes two bytes in UTF-8; "a" takes one */
		if (*pos < len + 2)
			return;
		cur = latin + *pos - len;
		cur[-2] = 'a';
		memmove(cur - 1, cur, len);
		*pos -= 1;
	} else if (prev == 0x094d) { /* virama */
		if (*pos < len + 1)
			return;
		cur = latin + *pos - len;
		if (cur[-1] == 'm') {
			/* move exactly the bytes of c, not c-many bytes */
			memmove(cur - 1, cur, len);
			*pos -= 1;
		}
	}
}
|
||||
|
||||
static struct translit_letter *letter_by_code(struct translit_letter *table,
|
||||
unsigned int c)
|
||||
{
|
||||
while (table->code != 0) {
|
||||
if (table->code == c)
|
||||
return table;
|
||||
table++;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
char *transcript_devanagari_to_czech(const char *devanagari)
|
||||
{
|
||||
struct translit_letter *table, *letter;
|
||||
unsigned int c, prev = 0, alloc = 0, done = 0, len;
|
||||
const char *src = devanagari;
|
||||
char *latin = NULL;
|
||||
|
||||
table = get_iast_czech_transliteration_table();
|
||||
|
||||
while (1) {
|
||||
if (alloc < done + UNICODE_MAX_LENGTH) {
|
||||
latin = realloc(latin, alloc + CHUNKSIZE);
|
||||
alloc += CHUNKSIZE;
|
||||
}
|
||||
|
||||
c = utf8_unpack_char(src);
|
||||
len = utf8_char_length(c);
|
||||
src += len;
|
||||
|
||||
nasal_consonants_filter(latin, &done, prev, c);
|
||||
|
||||
letter = letter_by_code(table, c);
|
||||
if (letter) {
|
||||
switch (letter->type) {
|
||||
case CONSONANT:
|
||||
strcpy(latin + done, letter->data);
|
||||
done += strlen(letter->data);
|
||||
*(latin + done++) = SCHWA_CHARACTER;
|
||||
break;
|
||||
case VOWEL_SIGN:
|
||||
if (done)
|
||||
done--;
|
||||
strcpy(latin + done, letter->data);
|
||||
done += strlen(letter->data);
|
||||
break;
|
||||
default:
|
||||
strcpy(latin + done, letter->data);
|
||||
done += strlen(letter->data);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
utf8_pack_char(latin + done, c);
|
||||
done += len;
|
||||
}
|
||||
|
||||
end_of_word_filter(latin, &done, prev, c);
|
||||
|
||||
|
||||
if (c == 0)
|
||||
break;
|
||||
prev = c;
|
||||
}
|
||||
|
||||
*(latin + done - 1) = '\0';
|
||||
|
||||
return latin;
|
||||
}
|
8
transcription.h
Normal file
8
transcription.h
Normal file
|
@ -0,0 +1,8 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
|
||||
/*
 * Include guard: identifiers containing a double underscore are reserved
 * for the implementation, so the guard uses a plain name.
 */
#ifndef TRANSCRIPTION_H
#define TRANSCRIPTION_H

/*
 * Transcribe a NUL-terminated UTF-8 Devanagari string into Czech spelling.
 * The result is heap-allocated; the caller releases it with free().
 */
char *transcript_devanagari_to_czech(const char *devanagari);

#endif /* TRANSCRIPTION_H */
|
Loading…
Reference in a new issue