diff --git a/Makefile b/Makefile
index 4957dc5..3a65fce 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 .PHONY: all clean
 
 all:
-	$(CC) main.c syllable.c -o main
+	$(CC) main.c syllable.c utf8.c -o main
 	./main
 
 clean:
diff --git a/utf8.c b/utf8.c
new file mode 100644
index 0000000..20ca835
--- /dev/null
+++ b/utf8.c
@@ -0,0 +1,110 @@
+/*
+ * Minimal UTF-8 helpers: decode one sequence into a code point, encode a
+ * code point into bytes, and build a NUL-terminated string from one.
+ */
+#include <stdlib.h>
+#include "utf8.h"
+
+/*
+ * Decode the UTF-8 sequence starting at src into a code point.
+ *
+ * The lead byte selects a length of 1 to 4 bytes, and that many bytes are
+ * read from src. Continuation bytes are masked but not validated. Returns 0
+ * when the lead byte matches none of the four lead patterns.
+ */
+unsigned long utf8_unpack_char(char *src)
+{
+    /* Read bytes as unsigned so the masks never see a negative char. */
+    const unsigned char *s = (const unsigned char *)src;
+    unsigned long c = 0;
+
+    if ((s[0] & 0x80) == 0x00) {
+        c = s[0] & 0x7f;
+    } else if ((s[0] & 0xe0) == 0xc0) {
+        c = (unsigned long)(s[0] & 0x1f) << 6;
+        c |= (unsigned long)(s[1] & 0x3f);
+    } else if ((s[0] & 0xf0) == 0xe0) {
+        c = (unsigned long)(s[0] & 0x0f) << 12;
+        c |= (unsigned long)(s[1] & 0x3f) << 6;
+        c |= (unsigned long)(s[2] & 0x3f);
+    } else if ((s[0] & 0xf8) == 0xf0) {
+        c = (unsigned long)(s[0] & 0x07) << 18;
+        c |= (unsigned long)(s[1] & 0x3f) << 12;
+        c |= (unsigned long)(s[2] & 0x3f) << 6;
+        c |= (unsigned long)(s[3] & 0x3f);
+    }
+
+    return c;
+}
+
+/*
+ * Encode code point c as UTF-8 into dest, without a terminating NUL.
+ *
+ * Writes 1 to 4 bytes for c <= 0x10ffff, so dest needs room for up to 4
+ * bytes. Any larger c is written as the single byte '?'.
+ */
+void utf8_pack_char(char *dest, unsigned long c)
+{
+    if (c <= 0x00007f) {
+        dest[0] = (char)c;
+    } else if (c <= 0x0007ff) {
+        dest[0] = (char)(0xc0 | ((c >> 6) & 0x1f));
+        dest[1] = (char)(0x80 | ((c >> 0) & 0x3f));
+    } else if (c <= 0x00ffff) {
+        dest[0] = (char)(0xe0 | ((c >> 12) & 0x0f));
+        dest[1] = (char)(0x80 | ((c >> 6) & 0x3f));
+        dest[2] = (char)(0x80 | ((c >> 0) & 0x3f));
+    } else if (c <= 0x10ffff) {
+        dest[0] = (char)(0xf0 | ((c >> 18) & 0x07));
+        dest[1] = (char)(0x80 | ((c >> 12) & 0x3f));
+        dest[2] = (char)(0x80 | ((c >> 6) & 0x3f));
+        dest[3] = (char)(0x80 | ((c >> 0) & 0x3f));
+    } else {
+        dest[0] = '?'; // should not happen
+    }
+}
+
+/*
+ * Return the UTF-8 encoded length of c in bytes (1 to 4), or 0 when c is
+ * above 0x10ffff and therefore has no encoding.
+ */
+unsigned int utf8_char_length(unsigned long c)
+{
+    if (c <= 0x00007f) {
+        return 1;
+    } else if (c <= 0x0007ff) {
+        return 2;
+    } else if (c <= 0x00ffff) {
+        return 3;
+    } else if (c <= 0x10ffff) {
+        return 4;
+    }
+
+    return 0; // should not happen
+}
+
+/*
+ * Return a newly allocated, NUL-terminated UTF-8 string holding c.
+ *
+ * The buffer comes from malloc(), so the caller releases it with free().
+ * Returns NULL if the allocation fails. A c above 0x10ffff yields "?".
+ */
+char *utf8_code_to_string(unsigned long c)
+{
+    unsigned int length = utf8_char_length(c);
+    char *buffer;
+
+    /* utf8_pack_char() still writes one byte ('?') for an invalid c. */
+    if (length == 0)
+        length = 1;
+
+    buffer = malloc(length + 1);
+    if (buffer == NULL)
+        return NULL;
+
+    utf8_pack_char(buffer, c);
+    /* Terminate right after the encoded bytes, inside the allocation. */
+    buffer[length] = 0;
+
+    return buffer;
+}
diff --git a/utf8.h b/utf8.h
new file mode 100644
index 0000000..772384d
--- /dev/null
+++ b/utf8.h
@@ -0,0 +1,10 @@
+#ifndef __UTF8_H
+#define __UTF8_H
+
+unsigned long utf8_unpack_char(char *src);
+void utf8_pack_char(char *dest, unsigned long c);
+
+unsigned int utf8_char_length(unsigned long c);
+char *utf8_code_to_string(unsigned long c);
+
+#endif /* __UTF8_H */