1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
/*
 * Copyright (C) 2008 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * Author(s):
 * 	Behdad Esfahbod
 */

#include <config.h>

#include "bteunistr.h"

#include <string.h>


/* Overview:
 *
 * The way bteunistr is implemented is very simple: Unicode only defines
 * codepoints less than 0x110000.  That leaves plenty of room in a guint32 to
 * use for other things.  So, whenever our "string" contains only one Unicode
 * character, we use its code as our bteunistr.  Otherwise, we register the
 * string in a central registry and assign a unique number to it and use that.
 * This number is "our own private internal non-unicode code for this
 * sequence of characters".
 *
 * The rest of the problem would be how to efficiently implement this
 * registry.  It does *NOT* really have to be efficient at all, as it will
 * only be accessed in case of combining marks.  And the strings are pretty
 * short (two or three characters).  But our implementation is quite efficient
 * anyway.
 *
 * The access pattern of using bteunistr's is that we have a bteunistr in a
 * terminal cell, a new gunichar comes in and we decide to combine with it,
 * and we combine them and get a new bteunistr.  So, that is exactly how we
 * encode bteunistr's: all we need to know about a bteunistr to be able to
 * reconstruct its string is the bteunistr and the gunichar that joined to
 * form it.  That's what BteUnistrDecomp is.  That is the decomposition.
 *
 * We start giving new bteunistr's unique numbers starting at
 * %BTE_UNISTR_START+1 and going up.  We keep the decompositions in a GArray,
 * called unistr_decomp.  The first entry of the array is unused (that's why
 * we start from %BTE_UNISTR_START plus one).  The decomposition table provides
 * enough information to efficiently answer questions like "what's the first
 * gunichar in this bteunistr?", "what's the sequence of gunichar's in this
 * bteunistr?", and "how many gunichar's are there in this bteunistr?".
 *
 * We still do not have any efficient way to construct new bteunistr's though.
 * Given a bteunistr and a gunichar, we have to walk over the entire
 * decomposition table to see if we have already registered (encoded) this
 * combination.  To make that operation fast, we use a reverse map, that is,
 * a GHashTable named unistr_comp.  The hash table maps a decomposition to its
 * encoded bteunistr value.  The value obviously fits in a pointer and does
 * not need memory allocation.  We also want to avoid allocating memory for
 * the key of the hash table entries, as we already have those decompositions
 * in the memory in the unistr_decomp array.  We cannot use direct pointers
 * though as when growing, the GArray may resize and move to a new memory
 * buffer, rendering all our pointers invalid.  For this reason, we keep the
 * index into the array as our hash table key.  When we want to perform a
 * lookup in the hash table, we insert the decomposition that we are searching
 * for as item zero in the unistr_decomp table, then lookup for an equal entry
 * of it in the hash table.  Finally, if the hash lookup fails, we add the new
 * decomposition to the lookup array and the hash table, and return the newly
 * encoded bteunistr value.
 */

#define BTE_UNISTR_START 0x80000000

static bteunistr unistr_next = BTE_UNISTR_START + 1;

struct BteUnistrDecomp {
	bteunistr prefix;
	gunichar  suffix;
};

GArray     *unistr_decomp;
GHashTable *unistr_comp;

#define DECOMP_FROM_INDEX(i)	g_array_index (unistr_decomp, struct BteUnistrDecomp, (i))
#define DECOMP_FROM_UNISTR(s)	DECOMP_FROM_INDEX ((s) - BTE_UNISTR_START)

static guint
unistr_comp_hash (gconstpointer key)
{
	struct BteUnistrDecomp *decomp;
	decomp = &DECOMP_FROM_INDEX (GPOINTER_TO_UINT (key));
	return decomp->prefix ^ decomp->suffix;
}

static gboolean
unistr_comp_equal (gconstpointer a, gconstpointer b)
{
	return 0 == memcmp (&DECOMP_FROM_INDEX (GPOINTER_TO_UINT (a)),
			    &DECOMP_FROM_INDEX (GPOINTER_TO_UINT (b)),
			    sizeof (struct BteUnistrDecomp));
}

bteunistr
_bte_unistr_append_unichar (bteunistr s, gunichar c)
{
	struct BteUnistrDecomp decomp;
	bteunistr ret = 0;

	decomp.prefix = s;
	decomp.suffix = c;

	if (G_UNLIKELY (!unistr_decomp)) {
		unistr_decomp = g_array_new (FALSE, TRUE, sizeof (struct BteUnistrDecomp));
		g_array_set_size (unistr_decomp, 1);
		unistr_comp = g_hash_table_new (unistr_comp_hash, unistr_comp_equal);
	} else {
		DECOMP_FROM_INDEX (0) = decomp;
		ret = GPOINTER_TO_UINT (g_hash_table_lookup (unistr_comp, GUINT_TO_POINTER (0)));
	}

	if (G_UNLIKELY (!ret)) {
		/* sanity check to avoid OOM */
		if (G_UNLIKELY (_bte_unistr_strlen (s) > 10 || unistr_next - BTE_UNISTR_START > 100000))
			return s;

		ret = unistr_next++;
		g_array_append_val (unistr_decomp, decomp);
		g_hash_table_insert (unistr_comp,
				     GUINT_TO_POINTER (ret - BTE_UNISTR_START),
				     GUINT_TO_POINTER (ret));
	}

	return ret;
}

bteunistr
_bte_unistr_append_unistr (bteunistr s, bteunistr t)
{
        g_return_val_if_fail (s < unistr_next, s);
        g_return_val_if_fail (t < unistr_next, s);
        if (G_UNLIKELY (t >= BTE_UNISTR_START)) {
                s = _bte_unistr_append_unistr (s, DECOMP_FROM_UNISTR (t).prefix);
                return _bte_unistr_append_unichar (s, DECOMP_FROM_UNISTR (t).suffix);
        } else {
                return _bte_unistr_append_unichar (s, t);
        }
}

gunichar
_bte_unistr_get_base (bteunistr s)
{
	g_return_val_if_fail (s < unistr_next, s);
	while (G_UNLIKELY (s >= BTE_UNISTR_START))
		s = DECOMP_FROM_UNISTR (s).prefix;
	return (gunichar) s;
}

void
_bte_unistr_append_to_gunichars (bteunistr s, GArray *a)
{
        if (G_UNLIKELY (s >= BTE_UNISTR_START)) {
                struct BteUnistrDecomp *decomp;
                decomp = &DECOMP_FROM_UNISTR (s);
                _bte_unistr_append_to_gunichars (decomp->prefix, a);
                s = decomp->suffix;
        }
        gunichar val = (gunichar) s;
        g_array_append_val (a, val);
}

bteunistr
_bte_unistr_replace_base (bteunistr s, gunichar c)
{
        g_return_val_if_fail (s < unistr_next, s);

        if (G_LIKELY (_bte_unistr_get_base(s) == c))
                return s;

        GArray *a = g_array_new (FALSE, FALSE, sizeof (gunichar));
        _bte_unistr_append_to_gunichars (s, a);
        g_assert_cmpint(a->len, >=, 1);<--- syntax error

        s = c;
        for (glong i = 1; i < a->len; i++)
                s = _bte_unistr_append_unichar (s, g_array_index (a, gunichar, i));

        g_array_free (a, TRUE);
        return s;
}

void
_bte_unistr_append_to_string (bteunistr s, GString *gs)
{
	g_return_if_fail (s < unistr_next);
	if (G_UNLIKELY (s >= BTE_UNISTR_START)) {
		struct BteUnistrDecomp *decomp;
		decomp = &DECOMP_FROM_UNISTR (s);
		_bte_unistr_append_to_string (decomp->prefix, gs);
		s = decomp->suffix;
	}
	g_string_append_unichar (gs, (gunichar) s);
}

int
_bte_unistr_strlen (bteunistr s)
{
	int len = 1;
	g_return_val_if_fail (s < unistr_next, len);
	while (G_UNLIKELY (s >= BTE_UNISTR_START)) {
		s = DECOMP_FROM_UNISTR (s).prefix;
		len++;
	}
	return len;
}