Prism Ruby parser
|
The encoding interface and implementations used by the parser. More...
#include "prism/defines.h"
#include "prism/util/pm_strncasecmp.h"
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
Go to the source code of this file.
Data Structures | |
struct | pm_encoding_t |
This struct defines the functions necessary to implement the encoding interface so we can determine how many bytes the subsequent character takes. More... | |
Macros | |
#define | PRISM_ENCODING_ALPHABETIC_BIT 1 << 0 |
All of the lookup tables use the first bit of each embedded byte to indicate whether the codepoint is alphabetical. | |
#define | PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1 |
All of the lookup tables use the second bit of each embedded byte to indicate whether the codepoint is alphanumeric. | |
#define | PRISM_ENCODING_UPPERCASE_BIT 1 << 2 |
All of the lookup tables use the third bit of each embedded byte to indicate whether the codepoint is uppercase. | |
#define | PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8]) |
This is the default UTF-8 encoding. | |
#define | PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII]) |
This is the US-ASCII encoding. | |
#define | PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT]) |
This is the ASCII-8BIT encoding. | |
#define | PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP]) |
This is the EUC-JP encoding. | |
#define | PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J]) |
This is the Windows-31J encoding. | |
Enumerations | |
enum | pm_encoding_type_t { PM_ENCODING_UTF_8 = 0 , PM_ENCODING_US_ASCII , PM_ENCODING_ASCII_8BIT , PM_ENCODING_EUC_JP , PM_ENCODING_WINDOWS_31J , PM_ENCODING_BIG5 , PM_ENCODING_BIG5_HKSCS , PM_ENCODING_BIG5_UAO , PM_ENCODING_CESU_8 , PM_ENCODING_CP51932 , PM_ENCODING_CP850 , PM_ENCODING_CP852 , PM_ENCODING_CP855 , PM_ENCODING_CP949 , PM_ENCODING_CP950 , PM_ENCODING_CP951 , PM_ENCODING_EMACS_MULE , PM_ENCODING_EUC_JP_MS , PM_ENCODING_EUC_JIS_2004 , PM_ENCODING_EUC_KR , PM_ENCODING_EUC_TW , PM_ENCODING_GB12345 , PM_ENCODING_GB18030 , PM_ENCODING_GB1988 , PM_ENCODING_GB2312 , PM_ENCODING_GBK , PM_ENCODING_IBM437 , PM_ENCODING_IBM720 , PM_ENCODING_IBM737 , PM_ENCODING_IBM775 , PM_ENCODING_IBM852 , PM_ENCODING_IBM855 , PM_ENCODING_IBM857 , PM_ENCODING_IBM860 , PM_ENCODING_IBM861 , PM_ENCODING_IBM862 , PM_ENCODING_IBM863 , PM_ENCODING_IBM864 , PM_ENCODING_IBM865 , PM_ENCODING_IBM866 , PM_ENCODING_IBM869 , PM_ENCODING_ISO_8859_1 , PM_ENCODING_ISO_8859_2 , PM_ENCODING_ISO_8859_3 , PM_ENCODING_ISO_8859_4 , PM_ENCODING_ISO_8859_5 , PM_ENCODING_ISO_8859_6 , PM_ENCODING_ISO_8859_7 , PM_ENCODING_ISO_8859_8 , PM_ENCODING_ISO_8859_9 , PM_ENCODING_ISO_8859_10 , PM_ENCODING_ISO_8859_11 , PM_ENCODING_ISO_8859_13 , PM_ENCODING_ISO_8859_14 , PM_ENCODING_ISO_8859_15 , PM_ENCODING_ISO_8859_16 , PM_ENCODING_KOI8_R , PM_ENCODING_KOI8_U , PM_ENCODING_MAC_CENT_EURO , PM_ENCODING_MAC_CROATIAN , PM_ENCODING_MAC_CYRILLIC , PM_ENCODING_MAC_GREEK , PM_ENCODING_MAC_ICELAND , PM_ENCODING_MAC_JAPANESE , PM_ENCODING_MAC_ROMAN , PM_ENCODING_MAC_ROMANIA , PM_ENCODING_MAC_THAI , PM_ENCODING_MAC_TURKISH , PM_ENCODING_MAC_UKRAINE , PM_ENCODING_SHIFT_JIS , PM_ENCODING_SJIS_DOCOMO , PM_ENCODING_SJIS_KDDI , PM_ENCODING_SJIS_SOFTBANK , PM_ENCODING_STATELESS_ISO_2022_JP , PM_ENCODING_STATELESS_ISO_2022_JP_KDDI , PM_ENCODING_TIS_620 , PM_ENCODING_UTF8_MAC , PM_ENCODING_UTF8_DOCOMO , PM_ENCODING_UTF8_KDDI , PM_ENCODING_UTF8_SOFTBANK , PM_ENCODING_WINDOWS_1250 , PM_ENCODING_WINDOWS_1251 , 
PM_ENCODING_WINDOWS_1252 , PM_ENCODING_WINDOWS_1253 , PM_ENCODING_WINDOWS_1254 , PM_ENCODING_WINDOWS_1255 , PM_ENCODING_WINDOWS_1256 , PM_ENCODING_WINDOWS_1257 , PM_ENCODING_WINDOWS_1258 , PM_ENCODING_WINDOWS_874 , PM_ENCODING_MAXIMUM } |
These are all of the encodings that prism supports. | |
Functions | |
size_t | pm_encoding_utf_8_char_width (const uint8_t *b, ptrdiff_t n) |
Return the size of the next character in the UTF-8 encoding. | |
size_t | pm_encoding_utf_8_alpha_char (const uint8_t *b, ptrdiff_t n) |
Return the size of the next character in the UTF-8 encoding if it is an alphabetical character. | |
size_t | pm_encoding_utf_8_alnum_char (const uint8_t *b, ptrdiff_t n) |
Return the size of the next character in the UTF-8 encoding if it is an alphanumeric character. | |
bool | pm_encoding_utf_8_isupper_char (const uint8_t *b, ptrdiff_t n) |
Return true if the next character in the UTF-8 encoding is an uppercase character. | |
const pm_encoding_t * | pm_encoding_find (const uint8_t *start, const uint8_t *end) |
Parse the given name of an encoding and return a pointer to the corresponding encoding struct if one can be found, otherwise return NULL. | |
Variables | |
const uint8_t | pm_encoding_unicode_table [256] |
This lookup table is referenced in both the UTF-8 encoding file and the parser directly in order to speed up the default encoding processing. | |
const pm_encoding_t | pm_encodings [PM_ENCODING_MAXIMUM] |
This is the table of all of the encodings that prism supports. | |
The encoding interface and implementations used by the parser.
#define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8]) |
This is the default UTF-8 encoding.
We need a reference to it to quickly create parsers.
#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII]) |
This is the US-ASCII encoding.
We need a reference to it to be able to compare against it when a string is being created because it could possibly need to fall back to ASCII-8BIT.
#define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT]) |
This is the ASCII-8BIT encoding.
We need a reference to it so that pm_strpbrk can compare against it because invalid multibyte characters are not a thing in this encoding. It is also needed for handling Regexp encoding flags.
#define PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP]) |
This is the EUC-JP encoding.
We need a reference to it to quickly process regular expression modifiers.
#define PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J]) |
This is the Windows-31J encoding.
We need a reference to it to quickly process regular expression modifiers.
size_t pm_encoding_utf_8_char_width | ( | const uint8_t * | b, |
ptrdiff_t | n | ||
) |
Return the size of the next character in the UTF-8 encoding.
b | The bytes to read. |
n | The number of bytes that can be read. |
size_t pm_encoding_utf_8_alpha_char | ( | const uint8_t * | b, |
ptrdiff_t | n | ||
) |
Return the size of the next character in the UTF-8 encoding if it is an alphabetical character.
b | The bytes to read. |
n | The number of bytes that can be read. |
size_t pm_encoding_utf_8_alnum_char | ( | const uint8_t * | b, |
ptrdiff_t | n | ||
) |
Return the size of the next character in the UTF-8 encoding if it is an alphanumeric character.
b | The bytes to read. |
n | The number of bytes that can be read. |
bool pm_encoding_utf_8_isupper_char | ( | const uint8_t * | b, |
ptrdiff_t | n | ||
) |
Return true if the next character in the UTF-8 encoding is an uppercase character.
b | The bytes to read. |
n | The number of bytes that can be read. |
const pm_encoding_t * pm_encoding_find | ( | const uint8_t * | start, |
const uint8_t * | end | ||
) |
Parse the given name of an encoding and return a pointer to the corresponding encoding struct if one can be found, otherwise return NULL.
start | A pointer to the first byte of the name. |
end | A pointer to the last byte of the name. |
const uint8_t pm_encoding_unicode_table[256] |
extern |
This lookup table is referenced in both the UTF-8 encoding file and the parser directly in order to speed up the default encoding processing.
It is used to indicate whether a character is alphabetical, alphanumeric, or uppercase in unicode mappings.
This lookup table is referenced in both the UTF-8 encoding file and the parser directly in order to speed up the default encoding processing.
Note that this table is different from other encodings where we used a lookup table because the indices of those tables are the byte representations, not the codepoints themselves.