The encoding interface and implementations used by the parser. More...

#include "prism/defines.h"
#include "prism/util/pm_strncasecmp.h"
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

Include dependency graph for encoding.h:

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures
struct	pm_encoding_t
	This struct defines the functions necessary to implement the encoding interface so we can determine how many bytes the subsequent character takes. More...

Macros
#define	PRISM_ENCODING_ALPHABETIC_BIT 1 << 0
	All of the lookup tables use the first bit of each embedded byte to indicate whether the codepoint is alphabetical.

#define	PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1
	All of the lookup tables use the second bit of each embedded byte to indicate whether the codepoint is alphanumeric.

#define	PRISM_ENCODING_UPPERCASE_BIT 1 << 2
	All of the lookup tables use the third bit of each embedded byte to indicate whether the codepoint is uppercase.

#define	PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8])
	This is the default UTF-8 encoding.

#define	PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])
	This is the US-ASCII encoding.

#define	PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])
	This is the ASCII-8BIT encoding.

#define	PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP])
	This is the EUC-JP encoding.

#define	PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J])
	This is the Windows-31J encoding.

Enumerations
enum	pm_encoding_type_t { PM_ENCODING_UTF_8 = 0 , PM_ENCODING_US_ASCII , PM_ENCODING_ASCII_8BIT , PM_ENCODING_EUC_JP , PM_ENCODING_WINDOWS_31J , PM_ENCODING_BIG5 , PM_ENCODING_BIG5_HKSCS , PM_ENCODING_BIG5_UAO , PM_ENCODING_CESU_8 , PM_ENCODING_CP51932 , PM_ENCODING_CP850 , PM_ENCODING_CP852 , PM_ENCODING_CP855 , PM_ENCODING_CP949 , PM_ENCODING_CP950 , PM_ENCODING_CP951 , PM_ENCODING_EMACS_MULE , PM_ENCODING_EUC_JP_MS , PM_ENCODING_EUC_JIS_2004 , PM_ENCODING_EUC_KR , PM_ENCODING_EUC_TW , PM_ENCODING_GB12345 , PM_ENCODING_GB18030 , PM_ENCODING_GB1988 , PM_ENCODING_GB2312 , PM_ENCODING_GBK , PM_ENCODING_IBM437 , PM_ENCODING_IBM720 , PM_ENCODING_IBM737 , PM_ENCODING_IBM775 , PM_ENCODING_IBM852 , PM_ENCODING_IBM855 , PM_ENCODING_IBM857 , PM_ENCODING_IBM860 , PM_ENCODING_IBM861 , PM_ENCODING_IBM862 , PM_ENCODING_IBM863 , PM_ENCODING_IBM864 , PM_ENCODING_IBM865 , PM_ENCODING_IBM866 , PM_ENCODING_IBM869 , PM_ENCODING_ISO_8859_1 , PM_ENCODING_ISO_8859_2 , PM_ENCODING_ISO_8859_3 , PM_ENCODING_ISO_8859_4 , PM_ENCODING_ISO_8859_5 , PM_ENCODING_ISO_8859_6 , PM_ENCODING_ISO_8859_7 , PM_ENCODING_ISO_8859_8 , PM_ENCODING_ISO_8859_9 , PM_ENCODING_ISO_8859_10 , PM_ENCODING_ISO_8859_11 , PM_ENCODING_ISO_8859_13 , PM_ENCODING_ISO_8859_14 , PM_ENCODING_ISO_8859_15 , PM_ENCODING_ISO_8859_16 , PM_ENCODING_KOI8_R , PM_ENCODING_KOI8_U , PM_ENCODING_MAC_CENT_EURO , PM_ENCODING_MAC_CROATIAN , PM_ENCODING_MAC_CYRILLIC , PM_ENCODING_MAC_GREEK , PM_ENCODING_MAC_ICELAND , PM_ENCODING_MAC_JAPANESE , PM_ENCODING_MAC_ROMAN , PM_ENCODING_MAC_ROMANIA , PM_ENCODING_MAC_THAI , PM_ENCODING_MAC_TURKISH , PM_ENCODING_MAC_UKRAINE , PM_ENCODING_SHIFT_JIS , PM_ENCODING_SJIS_DOCOMO , PM_ENCODING_SJIS_KDDI , PM_ENCODING_SJIS_SOFTBANK , PM_ENCODING_STATELESS_ISO_2022_JP , PM_ENCODING_STATELESS_ISO_2022_JP_KDDI , PM_ENCODING_TIS_620 , PM_ENCODING_UTF8_MAC , PM_ENCODING_UTF8_DOCOMO , PM_ENCODING_UTF8_KDDI , PM_ENCODING_UTF8_SOFTBANK , PM_ENCODING_WINDOWS_1250 , PM_ENCODING_WINDOWS_1251 , PM_ENCODING_WINDOWS_1252 , PM_ENCODING_WINDOWS_1253 , PM_ENCODING_WINDOWS_1254 , PM_ENCODING_WINDOWS_1255 , PM_ENCODING_WINDOWS_1256 , PM_ENCODING_WINDOWS_1257 , PM_ENCODING_WINDOWS_1258 , PM_ENCODING_WINDOWS_874 , PM_ENCODING_MAXIMUM }
	These are all of the encodings that prism supports.

Functions
size_t	pm_encoding_utf_8_char_width (const uint8_t *b, ptrdiff_t n)
	Return the size of the next character in the UTF-8 encoding.

size_t	pm_encoding_utf_8_alpha_char (const uint8_t *b, ptrdiff_t n)
	Return the size of the next character in the UTF-8 encoding if it is an alphabetical character.

size_t	pm_encoding_utf_8_alnum_char (const uint8_t *b, ptrdiff_t n)
	Return the size of the next character in the UTF-8 encoding if it is an alphanumeric character.

bool	pm_encoding_utf_8_isupper_char (const uint8_t *b, ptrdiff_t n)
	Return true if the next character in the UTF-8 encoding if it is an uppercase character.

const pm_encoding_t *	pm_encoding_find (const uint8_t start, const uint8_t end)
	Parse the given name of an encoding and return a pointer to the corresponding encoding struct if one can be found, otherwise return NULL.

Variables
const uint8_t	pm_encoding_unicode_table [256]
	This lookup table is referenced in both the UTF-8 encoding file and the parser directly in order to speed up the default encoding processing.

const pm_encoding_t	pm_encodings [PM_ENCODING_MAXIMUM]
	This is the table of all of the encodings that prism supports.

Detailed Description

The encoding interface and implementations used by the parser.

Macro Definition Documentation

◆ PM_ENCODING_UTF_8_ENTRY

#define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8])

This is the default UTF-8 encoding.

We need a reference to it to quickly create parsers.

◆ PM_ENCODING_US_ASCII_ENTRY

#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])

This is the US-ASCII encoding.

We need a reference to it to be able to compare against it when a string is being created because it could possibly need to fall back to ASCII-8BIT.

◆ PM_ENCODING_ASCII_8BIT_ENTRY

#define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])

This is the ASCII-8BIT encoding.

We need a reference to it so that pm_strpbrk can compare against it because invalid multibyte characters are not a thing in this encoding. It is also needed for handling Regexp encoding flags.

◆ PM_ENCODING_EUC_JP_ENTRY

#define PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP])

This is the EUC-JP encoding.

We need a reference to it to quickly process regular expression modifiers.

◆ PM_ENCODING_WINDOWS_31J_ENTRY

#define PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J])

This is the Windows-31J encoding.

We need a reference to it to quickly process regular expression modifiers.

Function Documentation

◆ pm_encoding_utf_8_char_width()

size_t pm_encoding_utf_8_char_width	(	const uint8_t *	b,
		ptrdiff_t	n
	)

Return the size of the next character in the UTF-8 encoding.

Parameters

b	The bytes to read.
n	The number of bytes that can be read.

Returns: The number of bytes that the next character takes if it is valid in the encoding, or 0 if it is not.

◆ pm_encoding_utf_8_alpha_char()

size_t pm_encoding_utf_8_alpha_char	(	const uint8_t *	b,
		ptrdiff_t	n
	)

Return the size of the next character in the UTF-8 encoding if it is an alphabetical character.

Parameters

b	The bytes to read.
n	The number of bytes that can be read.

Returns: The number of bytes that the next character takes if it is valid in the encoding, or 0 if it is not.

◆ pm_encoding_utf_8_alnum_char()

size_t pm_encoding_utf_8_alnum_char	(	const uint8_t *	b,
		ptrdiff_t	n
	)

Return the size of the next character in the UTF-8 encoding if it is an alphanumeric character.

Parameters

b	The bytes to read.
n	The number of bytes that can be read.

Returns: The number of bytes that the next character takes if it is valid in the encoding, or 0 if it is not.

◆ pm_encoding_utf_8_isupper_char()

bool pm_encoding_utf_8_isupper_char	(	const uint8_t *	b,
		ptrdiff_t	n
	)

Return true if the next character in the UTF-8 encoding if it is an uppercase character.

Parameters

b	The bytes to read.
n	The number of bytes that can be read.

Returns: True if the next character is valid in the encoding and is an uppercase character, or false if it is not.

◆ pm_encoding_find()

const pm_encoding_t * pm_encoding_find	(	const uint8_t *	start,
		const uint8_t *	end
	)

Parse the given name of an encoding and return a pointer to the corresponding encoding struct if one can be found, otherwise return NULL.

Parameters

start	A pointer to the first byte of the name.
end	A pointer to the last byte of the name.

Returns: A pointer to the encoding struct if one is found, otherwise NULL.

Variable Documentation

◆ pm_encoding_unicode_table

const uint8_t pm_encoding_unicode_table[256]

extern

This lookup table is referenced in both the UTF-8 encoding file and the parser directly in order to speed up the default encoding processing.

It is used to indicate whether a character is alphabetical, alphanumeric, or uppercase in unicode mappings.

This lookup table is referenced in both the UTF-8 encoding file and the parser directly in order to speed up the default encoding processing.

Note that this table is different from other encodings where we used a lookup table because the indices of those tables are the byte representations, not the codepoints themselves.

Data Structures

Macros

Enumerations

Functions

Variables

Detailed Description

Macro Definition Documentation

◆ PM_ENCODING_UTF_8_ENTRY

◆ PM_ENCODING_US_ASCII_ENTRY

◆ PM_ENCODING_ASCII_8BIT_ENTRY

◆ PM_ENCODING_EUC_JP_ENTRY

◆ PM_ENCODING_WINDOWS_31J_ENTRY

Function Documentation

◆ pm_encoding_utf_8_char_width()

◆ pm_encoding_utf_8_alpha_char()

◆ pm_encoding_utf_8_alnum_char()

◆ pm_encoding_utf_8_isupper_char()

◆ pm_encoding_find()

Variable Documentation

◆ pm_encoding_unicode_table