Logo Search packages:      
Sourcecode: pango-graphite version File versions  Download package

silgraphite-break.c

/* Pango
 * silgraphite-break.c: SILGraphite based shaper for FreeType-based backends
 *
 * Copyright (C) 2004-2005 SIL International
 * Author: Daniel Glassey <wdg@debian.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.      See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <gmodule.h>
#include <pango/pango-engine.h>
#include <pango/pango-utils.h>
#include <pango/pangofc-font.h>
#include <pango/pangoxft.h>
#include "pangographite.h"

/* Flags describing the state of the Graphite backend.
 * NOTE(review): nothing in the visible part of this file reads or writes
 * them; presumably the module set-up code does -- confirm.
 */
static gboolean have_graphite = FALSE;
static gboolean graphite_initialised = FALSE;

/* Signature of the routine that performs the actual Graphite breaking.
 * It receives the UTF-8 text and its byte length, the font to use, and the
 * PangoLogAttr array (with its element count) that it fills in.
 */
typedef void (*pgraphite_real_break) (const char   *text,
                                      int           length,
                                      PangoFont    *font,
                                      PangoLogAttr *attrs,
                                      int           attrs_len);

/* Pointer to the break routine and the module handle associated with it.
 * NOTE(review): presumably resolved from graphitelibrary at load time --
 * the lookup is not in this part of the file.
 */
static pgraphite_real_break graphite_real_break = NULL;
static GModule *graphitelibrary = NULL;

/* The Graphite break engine is a plain PangoEngineLang: the instance and
 * class types are simple aliases. No extra fields needed.
 */
typedef PangoEngineLang      GraphiteEngineBreak;
typedef PangoEngineLangClass GraphiteEngineBreakClass;

/* Identifier and render type used in the script_engines descriptor below */
#define LANG_ENGINE_NAME "GraphiteScriptEngineBreak"
#define RENDER_TYPE PANGO_RENDER_TYPE_NONE


/* Scripts this engine is registered for; referenced (with its element
 * count) from the script_engines descriptor. Every entry pairs a
 * PangoScript with the language string "*".
 * NOTE(review): "*" is presumably a wildcard for "any language" -- confirm
 * against the PangoEngineScriptInfo documentation.
 * The trailing comment on each line is the script's four-letter code.
 */
static PangoEngineScriptInfo graphite_scripts[] = {
  {    PANGO_SCRIPT_COMMON,   "*" },             /* Zyyy */
  {    PANGO_SCRIPT_INHERITED,      "*" },          /* Qaai */
  {    PANGO_SCRIPT_ARABIC,   "*" },             /* Arab */
  {    PANGO_SCRIPT_ARMENIAN, "*" },           /* Armn */
  {    PANGO_SCRIPT_BENGALI,  "*" },            /* Beng */
  {    PANGO_SCRIPT_BOPOMOFO, "*" },           /* Bopo */
  {    PANGO_SCRIPT_CHEROKEE, "*" },           /* Cher */
  {    PANGO_SCRIPT_COPTIC,   "*" },             /* Qaac */
  {    PANGO_SCRIPT_CYRILLIC, "*" },           /* Cyrl (Cyrs) */
  {    PANGO_SCRIPT_DESERET,  "*" },            /* Dsrt */
  {    PANGO_SCRIPT_DEVANAGARI,     "*" },         /* Deva */
  {    PANGO_SCRIPT_ETHIOPIC, "*" },           /* Ethi */
  {    PANGO_SCRIPT_GEORGIAN, "*" },           /* Geor (Geon, Geoa) */
  {    PANGO_SCRIPT_GOTHIC,   "*" },             /* Goth */
  {    PANGO_SCRIPT_GREEK,    "*" },              /* Grek */
  {    PANGO_SCRIPT_GUJARATI, "*" },           /* Gujr */
  {    PANGO_SCRIPT_GURMUKHI, "*" },           /* Guru */
  {    PANGO_SCRIPT_HAN,      "*" },                /* Hani */
  {    PANGO_SCRIPT_HANGUL,   "*" },             /* Hang */
  {    PANGO_SCRIPT_HEBREW,   "*" },             /* Hebr */
  {    PANGO_SCRIPT_HIRAGANA, "*" },           /* Hira */
  {    PANGO_SCRIPT_KANNADA,  "*" },            /* Knda */
  {    PANGO_SCRIPT_KATAKANA, "*" },           /* Kana */
  {    PANGO_SCRIPT_KHMER,    "*" },              /* Khmr */
  {    PANGO_SCRIPT_LAO,      "*" },                /* Laoo */
  {    PANGO_SCRIPT_LATIN,    "*" },              /* Latn (Latf, Latg) */
  {    PANGO_SCRIPT_MALAYALAM,      "*" },          /* Mlym */
  {    PANGO_SCRIPT_MONGOLIAN,      "*" },          /* Mong */
  {    PANGO_SCRIPT_MYANMAR,  "*" },            /* Mymr */
  {    PANGO_SCRIPT_OGHAM,    "*" },              /* Ogam */
  {    PANGO_SCRIPT_OLD_ITALIC,     "*" },         /* Ital */
  {    PANGO_SCRIPT_ORIYA,    "*" },              /* Orya */
  {    PANGO_SCRIPT_RUNIC,    "*" },              /* Runr */
  {    PANGO_SCRIPT_SINHALA,  "*" },            /* Sinh */
  {    PANGO_SCRIPT_SYRIAC,   "*" },             /* Syrc (Syrj, Syrn, Syre) */
  {    PANGO_SCRIPT_TAMIL,    "*" },              /* Taml */
  {    PANGO_SCRIPT_TELUGU,   "*" },             /* Telu */
  {    PANGO_SCRIPT_THAANA,   "*" },             /* Thaa */
  {    PANGO_SCRIPT_THAI,     "*" },               /* Thai */
  {    PANGO_SCRIPT_TIBETAN,  "*" },            /* Tibt */
  {    PANGO_SCRIPT_CANADIAN_ABORIGINAL,  "*" }, /* Cans */
  {    PANGO_SCRIPT_YI,       "*" },                 /* Yiii */
  {    PANGO_SCRIPT_TAGALOG,  "*" },            /* Tglg */
  {    PANGO_SCRIPT_HANUNOO,  "*" },            /* Hano */
  {    PANGO_SCRIPT_BUHID,    "*" },              /* Buhd */
  {    PANGO_SCRIPT_TAGBANWA, "*" },           /* Tagb */
  
      /* Unicode-4.0 additions */
  {    PANGO_SCRIPT_BRAILLE,  "*" },            /* Brai */
  {    PANGO_SCRIPT_CYPRIOT,  "*" },            /* Cprt */
  {    PANGO_SCRIPT_LIMBU,    "*" },              /* Limb */
  {    PANGO_SCRIPT_OSMANYA,  "*" },            /* Osma */
  {    PANGO_SCRIPT_SHAVIAN,  "*" },            /* Shaw */
  {    PANGO_SCRIPT_LINEAR_B, "*" },           /* Linb */
  {    PANGO_SCRIPT_TAI_LE,   "*" },             /* Tale */
  {    PANGO_SCRIPT_UGARITIC, "*" }            /* Ugar */
#if 0
      /* Disabled: the first entry carries the leading comma that would
       * join it to the Ugaritic entry above if this block were enabled.
       */
      /* Unicode-4.1 additions  (requires pango >= 1.9.1) */
  ,{    PANGO_SCRIPT_NEW_TAI_LUE,   "*" },            /* Talu */
  {    PANGO_SCRIPT_BUGINESE, "*" },            /* Bugi */
  {    PANGO_SCRIPT_GLAGOLITIC,     "*" },            /* Glag */
  {    PANGO_SCRIPT_TIFINAGH, "*" },            /* Tfng */
  {    PANGO_SCRIPT_SYLOTI_NAGRI,   "*" },            /* Sylo */
  {    PANGO_SCRIPT_OLD_PERSIAN,    "*" },            /* Xpeo */
  {    PANGO_SCRIPT_KHAROSHTHI,     "*" }            /* Khar */
#endif
};

/* Engine descriptors exported by this module: a single language
 * (PANGO_ENGINE_TYPE_LANG) engine named LANG_ENGINE_NAME, with render type
 * RENDER_TYPE, covering every script listed in graphite_scripts.
 */
static PangoEngineInfo script_engines[] = {
  {
    LANG_ENGINE_NAME,
    PANGO_ENGINE_TYPE_LANG,
    RENDER_TYPE,
    /* script table and its element count */
    graphite_scripts, G_N_ELEMENTS(graphite_scripts)
  }
};



/* Kind of line-break opportunity between two adjacent characters; this is
 * the element type of the row_* pair tables.
 */
typedef enum
{
  BREAK_ALREADY_HANDLED = 0, /* didn't use the table */
  BREAK_PROHIBITED      = 1, /* no break, even if spaces intervene */
  BREAK_IF_SPACES       = 2, /* "indirect break" (only if there are spaces) */
  BREAK_ALLOWED         = 3  /* "direct break" (can always break here) */
  /* TR 14 defines one further class, "indirect break opportunity for
   * combining marks following a space"; that case is handled inline in
   * the code instead of through this enum.
   */
} BreakOpportunity;

/* Row/column indexes into the line-break pair table. Everything listed
 * before INDEX_END_OF_TABLE has a row and a column in the table; the
 * entries after it do not.
 */
enum
{
  INDEX_OPEN_PUNCTUATION = 0,   /*  0 */
  INDEX_CLOSE_PUNCTUATION,      /*  1 */
  INDEX_QUOTATION,              /*  2 */
  INDEX_NON_BREAKING_GLUE,      /*  3 */
  INDEX_NON_STARTER,            /*  4 */
  INDEX_EXCLAMATION,            /*  5 */
  INDEX_SYMBOL,                 /*  6 */
  INDEX_INFIX_SEPARATOR,        /*  7 */
  INDEX_PREFIX,                 /*  8 */
  INDEX_POSTFIX,                /*  9 */
  INDEX_NUMERIC,                /* 10 */
  INDEX_ALPHABETIC,             /* 11 */
  INDEX_IDEOGRAPHIC,            /* 12 */
  INDEX_INSEPARABLE,            /* 13 */
  INDEX_HYPHEN,                 /* 14 */
  INDEX_AFTER,                  /* 15 */
  INDEX_BEFORE,                 /* 16 */
  INDEX_BEFORE_AND_AFTER,       /* 17 */
  INDEX_ZERO_WIDTH_SPACE,       /* 18 */
  INDEX_COMBINING_MARK,         /* 19 */
  INDEX_WORD_JOINER,            /* 20 */

  /* End of the table: also the number of rows and columns */
  INDEX_END_OF_TABLE,           /* 21 */

  /* The following are not in the tables */
  INDEX_MANDATORY,              /* 22 */
  INDEX_CARRIAGE_RETURN,        /* 23 */
  INDEX_LINE_FEED,              /* 24 */
  INDEX_SURROGATE,              /* 25 */
  INDEX_CONTINGENT,             /* 26 */
  INDEX_SPACE,                  /* 27 */
  INDEX_COMPLEX_CONTEXT,        /* 28 */
  INDEX_AMBIGUOUS,              /* 29 */
  INDEX_UNKNOWN,                /* 30 */
  INDEX_NEXT_LINE               /* 31 */
};

/* Line-break pair table, one array per row.
 *
 * A row is selected by the class of the character BEFORE the candidate
 * break position and indexed by the class of the character AFTER it (see
 * the BREAK_ROW / BREAK_OP macros). Each row has INDEX_END_OF_TABLE
 * entries, in the order of the INDEX_* enum: OPEN_PUNCTUATION,
 * CLOSE_PUNCTUATION, QUOTATION, NON_BREAKING_GLUE, NON_STARTER,
 * EXCLAMATION, SYMBOL, INFIX_SEPARATOR, PREFIX, POSTFIX, NUMERIC,
 * ALPHABETIC, IDEOGRAPHIC, INSEPARABLE, HYPHEN, AFTER, BEFORE,
 * BEFORE_AND_AFTER, ZERO_WIDTH_SPACE, COMBINING_MARK, WORD_JOINER.
 */

/* before = open punctuation: every entry is BREAK_PROHIBITED */
static const BreakOpportunity row_OPEN_PUNCTUATION[INDEX_END_OF_TABLE] = {
  BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = close punctuation */
static const BreakOpportunity row_CLOSE_PUNCTUATION[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = quotation mark */
static const BreakOpportunity row_QUOTATION[INDEX_END_OF_TABLE] = {
  BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = non-breaking glue */
static const BreakOpportunity row_NON_BREAKING_GLUE[INDEX_END_OF_TABLE] = {
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = non-starter */
static const BreakOpportunity row_NON_STARTER[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = exclamation */
static const BreakOpportunity row_EXCLAMATION[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = symbol */
static const BreakOpportunity row_SYMBOL[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = infix separator */
static const BreakOpportunity row_INFIX_SEPARATOR[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = prefix */
static const BreakOpportunity row_PREFIX[INDEX_END_OF_TABLE] = {
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = postfix */
static const BreakOpportunity row_POSTFIX[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = numeric */
static const BreakOpportunity row_NUMERIC[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = alphabetic */
static const BreakOpportunity row_ALPHABETIC[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = ideographic */
static const BreakOpportunity row_IDEOGRAPHIC[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED, BREAK_ALLOWED,
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = inseparable */
static const BreakOpportunity row_INSEPARABLE[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = hyphen */
static const BreakOpportunity row_HYPHEN[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_ALLOWED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = break-after class */
static const BreakOpportunity row_AFTER[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = break-before class */
static const BreakOpportunity row_BEFORE[INDEX_END_OF_TABLE] = {
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = break-before-and-after class */
static const BreakOpportunity row_BEFORE_AND_AFTER[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = zero-width space: BREAK_ALLOWED everywhere except before
 * another zero-width space or a combining mark
 */
static const BreakOpportunity row_ZERO_WIDTH_SPACE[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED, BREAK_ALLOWED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED
};

/* before = combining mark (same entries as row_ALPHABETIC) */
static const BreakOpportunity row_COMBINING_MARK[INDEX_END_OF_TABLE] = {
  BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_ALLOWED, BREAK_ALLOWED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

/* before = word joiner */
static const BreakOpportunity row_WORD_JOINER[INDEX_END_OF_TABLE] = {
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_IF_SPACES,
  BREAK_IF_SPACES, BREAK_IF_SPACES, BREAK_PROHIBITED, BREAK_PROHIBITED,
  BREAK_PROHIBITED
};

static const BreakOpportunity *const line_break_rows[INDEX_END_OF_TABLE] = {
  row_OPEN_PUNCTUATION, /* INDEX_OPEN_PUNCTUATION */
  row_CLOSE_PUNCTUATION, /* INDEX_CLOSE_PUNCTUATION */
  row_QUOTATION, /* INDEX_QUOTATION */
  row_NON_BREAKING_GLUE, /* INDEX_NON_BREAKING_GLUE */
  row_NON_STARTER, /* INDEX_NON_STARTER */
  row_EXCLAMATION, /* INDEX_EXCLAMATION */
  row_SYMBOL, /* INDEX_SYMBOL */
  row_INFIX_SEPARATOR, /* INDEX_INFIX_SEPARATOR */
  row_PREFIX, /* INDEX_PREFIX */
  row_POSTFIX, /* INDEX_POSTFIX */
  row_NUMERIC, /* INDEX_NUMERIC */
  row_ALPHABETIC, /* INDEX_ALPHABETIC */
  row_IDEOGRAPHIC, /* INDEX_IDEOGRAPHIC */
  row_INSEPARABLE, /* INDEX_INSEPARABLE */
  row_HYPHEN, /* INDEX_HYPHEN */
  row_AFTER, /* INDEX_AFTER */
  row_BEFORE, /* INDEX_BEFORE */
  row_BEFORE_AND_AFTER, /* INDEX_BEFORE_AND_AFTER */
  row_ZERO_WIDTH_SPACE, /* INDEX_ZERO_WIDTH_SPACE */
  row_COMBINING_MARK, /* INDEX_COMBINING_MARK */
  row_WORD_JOINER /* INDEX_WORD_JOINER */
};

/* Map GUnicodeBreakType to table indexes.
 *
 * The array is subscripted directly with a GUnicodeBreakType value (see
 * BREAK_INDEX), so the entries must stay in the same order as that enum's
 * values. NOTE(review): the order is not checkable from this file --
 * verify against the GLib headers when GLib adds break types.
 * Entries that are >= INDEX_END_OF_TABLE have no row/column in the pair
 * table; IN_BREAK_TABLE tests for that.
 */
static const int line_break_indexes[] = {
  INDEX_MANDATORY,
  INDEX_CARRIAGE_RETURN,
  INDEX_LINE_FEED,
  INDEX_COMBINING_MARK,
  INDEX_SURROGATE,
  INDEX_ZERO_WIDTH_SPACE,
  INDEX_INSEPARABLE,
  INDEX_NON_BREAKING_GLUE,
  INDEX_CONTINGENT,
  INDEX_SPACE,
  INDEX_AFTER,
  INDEX_BEFORE,
  INDEX_BEFORE_AND_AFTER,
  INDEX_HYPHEN,
  INDEX_NON_STARTER,
  INDEX_OPEN_PUNCTUATION,
  INDEX_CLOSE_PUNCTUATION,
  INDEX_QUOTATION,
  INDEX_EXCLAMATION,
  INDEX_IDEOGRAPHIC,
  INDEX_NUMERIC,
  INDEX_INFIX_SEPARATOR,
  INDEX_SYMBOL,
  INDEX_ALPHABETIC,
  INDEX_PREFIX,
  INDEX_POSTFIX,
  INDEX_COMPLEX_CONTEXT,
  INDEX_AMBIGUOUS,
  INDEX_UNKNOWN,
  INDEX_NEXT_LINE,
  INDEX_WORD_JOINER
};

/* Helpers for looking up the line-break pair table.
 *
 * Every macro parenthesizes its argument, so an expression argument (for
 * example "a | b") is range-checked as a whole. Arguments may be evaluated
 * more than once and therefore must not have side effects.
 */

/* Clamp a break type to the range covered by line_break_indexes; values
 * outside that range become G_UNICODE_BREAK_UNKNOWN.
 */
#define BREAK_TYPE_SAFE(btype)            \
         ((btype) < G_N_ELEMENTS(line_break_indexes) ? (btype) : G_UNICODE_BREAK_UNKNOWN)
/* INDEX_* value for a break type; btype must already be in range */
#define BREAK_INDEX(btype)                \
         (line_break_indexes[(btype)])
/* Pair-table row for the class before the break; only valid when
 * IN_BREAK_TABLE (before_type) holds.
 */
#define BREAK_ROW(before_type)            \
         (line_break_rows[BREAK_INDEX (before_type)])
/* Pair-table entry for a (before, after) pair; both types must satisfy
 * IN_BREAK_TABLE.
 */
#define BREAK_OP(before_type, after_type) \
         (BREAK_ROW (before_type)[BREAK_INDEX (after_type)])
/* True when btype is in range and its class has a row/column in the table */
#define IN_BREAK_TABLE(btype)             \
         ((btype) < G_N_ELEMENTS(line_break_indexes) && BREAK_INDEX(btype) < INDEX_END_OF_TABLE)

/* Keep these in sync with the same macros in the test program */

/* Code point ranges for Jamo, split into leading / vowel / trailing, plus
 * the range spanning all of them. Each macro evaluates (wc) more than
 * once, so the argument must be free of side effects.
 */
#define LEADING_JAMO(wc)  (0x1100 <= (wc) && (wc) <= 0x115F)
#define VOWEL_JAMO(wc)    (0x1160 <= (wc) && (wc) <= 0x11A2)
#define TRAILING_JAMO(wc) (0x11A8 <= (wc) && (wc) <= 0x11F9)
#define JAMO(wc)          (0x1100 <= (wc) && (wc) <= 0x11FF)

/* "virama script" is only a cheap pre-filter: the range also covers a
 * number of scripts that have no virama at all
 */
#define VIRAMA_SCRIPT(wc) (0x0901 <= (wc) && (wc) <= 0x17FF)

/* The individual virama code points */
#define VIRAMA(wc) (   (wc) == 0x094D \
                    || (wc) == 0x09CD \
                    || (wc) == 0x0A4D \
                    || (wc) == 0x0ACD \
                    || (wc) == 0x0B4D \
                    || (wc) == 0x0BCD \
                    || (wc) == 0x0C4D \
                    || (wc) == 0x0CCD \
                    || (wc) == 0x0D4D \
                    || (wc) == 0x0DCA \
                    || (wc) == 0x0E3A \
                    || (wc) == 0x0F84 \
                    || (wc) == 0x1039 \
                    || (wc) == 0x17D2)

/* Types of Japanese characters */
#define JAPANESE(wc) (0x2F00 <= (wc) && (wc) <= 0x30FF)
#define KANJI(wc)    (0x2F00 <= (wc) && (wc) <= 0x2FDF)
#define HIRAGANA(wc) (0x3040 <= (wc) && (wc) <= 0x309F)
#define KATAKANA(wc) (0x30A0 <= (wc) && (wc) <= 0x30FF)

/* Script ranges consulted by BACKSPACE_DELETES_CHARACTER */
#define LATIN(wc)    ((0x0020 <= (wc) && (wc) <= 0x02AF) || (0x1E00 <= (wc) && (wc) <= 0x1EFF))
#define CYRILLIC(wc) (0x0400 <= (wc) && (wc) <= 0x052F)
#define GREEK(wc)    ((0x0370 <= (wc) && (wc) <= 0x03FF) || (0x1F00 <= (wc) && (wc) <= 0x1FFF))
#define KANA(wc)     (0x3040 <= (wc) && (wc) <= 0x30FF)
#define HANGUL(wc)   (0xAC00 <= (wc) && (wc) <= 0xD7A3)

/* True for every code point outside the Latin, Cyrillic, Greek, Kana and
 * Hangul ranges above
 */
#define BACKSPACE_DELETES_CHARACTER(wc) \
         (!(LATIN (wc) || CYRILLIC (wc) || GREEK (wc) || KANA (wc) || HANGUL (wc)))


/* States used while looking for sentence boundaries.
 * p. 132-133 of Unicode spec table 5-6 will help understand this
 */
typedef enum
{
  STATE_SENTENCE_OUTSIDE = 0,
  STATE_SENTENCE_BODY,
  /* states entered from a "Term" */
  STATE_SENTENCE_TERM,
  STATE_SENTENCE_POST_TERM_CLOSE,
  STATE_SENTENCE_POST_TERM_SPACE,
  STATE_SENTENCE_POST_TERM_SEP,
  /* states entered from a dot (period) */
  STATE_SENTENCE_DOT,
  STATE_SENTENCE_POST_DOT_CLOSE,
  STATE_SENTENCE_POST_DOT_SPACE,
  STATE_SENTENCE_POST_DOT_OPEN,
  /* never include line/para separators in a sentence for now */
  /* Not in the spec: it is unclear why the spec would include one
   * line/para separator in lines ending with Term but not in
   * period-terminated lines, so the dot lines get the same treatment
   */
  STATE_SENTENCE_POST_DOT_SEP
} SentenceState;

/* Kind of word currently being scanned. Both "123" and "foobar" count as
 * words here, while "123foo" counts as two; the Unicode spec just calls
 * "123" a non-word.
 */
typedef enum
{
  WordNone = 0,   /* not inside a word */
  WordLetters,    /* inside a word made of letters */
  WordNumbers     /* inside a word made of digits */
} WordType;


/**
 * pango_default_break_for_graphite:
 * @text: text to break
 * @length: length of text in bytes (may be -1 if @text is nul-terminated)
 * @analysis: a #PangoAnalysis for the @text
 * @attrs: logical attributes to fill in
 * @attrs_len: size of the array passed as @attrs; must be at least one
 *             more than the number of characters in @text
 *
 * This is the default break algorithm, used if no language
 * engine overrides it. Normally you should use pango_break()
 * instead; this function is mostly useful for chaining up
 * from a language engine override. Unlike pango_break(),
 * @analysis can be NULL, but only do that if you know what
 * you're doing, and only when this file is built without GR_LB: with
 * GR_LB enabled, @analysis->font is dereferenced, so @analysis must
 * not be NULL. (If you need an analysis to pass to pango_break(),
 * you need to pango_itemize() or use pango_get_log_attrs().)
 *
 **/
void
pango_default_break_for_graphite (const gchar   *text,
                     gint           length,
                     PangoAnalysis *analysis,
                     PangoLogAttr  *attrs,
                     int            attrs_len)
{
  /* The rationale for all this is in section 5.15 of the Unicode 3.0 book,
   * the line breaking stuff is also in TR14 on unicode.org
   */

  /* This is a default break implementation that should work for nearly all
   * languages. Language engines can override it optionally.
   */

  /* FIXME one cheesy optimization here would be to memset attrs to 0
   * before we start, and then never assign FALSE to anything
   */

    const gchar *next;
    gint i;
    gunichar prev_wc;
    gunichar next_wc;
    GUnicodeBreakType next_break_type;
    GUnicodeType prev_type;
    GUnicodeBreakType prev_break_type; /* skips spaces */
    gboolean prev_was_break_space;
    WordType current_word_type = WordNone;
    gunichar last_word_letter = 0;
    gunichar base_character = 0;
    SentenceState sentence_state = STATE_SENTENCE_OUTSIDE;
    /* Tracks what will be the end of the sentence if a period is
    * determined to actually be a sentence-ending period.
    */
    gint possible_sentence_end = -1;
    /* possible sentence break before Open* after a period-ended sentence */
    gint possible_sentence_boundary = -1;
    gint n_chars;
    
    g_return_if_fail (text != NULL);
    g_return_if_fail (attrs != NULL);
    
    n_chars = g_utf8_strlen (text, length);
    
    next = text;
    
    /* + 1 because of the extra newline we stick on the end */
    if (attrs_len < n_chars + 1)
    {
        g_warning ("pango_default_break(): the array of PangoLogAttr passed in must have at least N+1 elements, if there are N characters in the text being broken");
        return;
    }
      
    prev_type = (GUnicodeType) -1;
    prev_break_type = G_UNICODE_BREAK_UNKNOWN;
    prev_was_break_space = FALSE;
    prev_wc = 0;

    if (n_chars)
    {
        next_wc = g_utf8_get_char (next);
        g_assert (next_wc != 0);
    }
    else
        next_wc = '\n';

    next_break_type = g_unichar_break_type (next_wc);
    next_break_type = BREAK_TYPE_SAFE (next_break_type);

#if GR_LB // calculate is_cursor_position the graphite way
    static GStaticMutex mutex = G_STATIC_MUTEX_INIT;

    g_static_mutex_lock (&mutex);
    // Do Graphite Breaking
    graphite_real_break(text, length, analysis->font, attrs, attrs_len);
    g_static_mutex_unlock (&mutex);
#endif
    for (i = 0; i <= n_chars; i++)
    {
        GUnicodeType type;
        gunichar wc;
        GUnicodeBreakType break_type;
        //BreakOpportunity break_op;

        wc = next_wc;
        break_type = next_break_type;

        if (i == n_chars)
        {
          /*
           * If we have already reached the end of @text g_utf8_next_char()
           * may not increment next
           */
            next_wc = 0;
              next_break_type = G_UNICODE_BREAK_UNKNOWN;
        }
        else
        {
            next = g_utf8_next_char (next);

            if (i == n_chars - 1)
            {
              /* This is how we fill in the last element (end position) of the
               * attr array - assume there's a newline off the end of @text.
               */
                next_wc = '\n';
            }
            else
            {
                next_wc = g_utf8_get_char (next);
                g_assert (next_wc != 0);
            }
        
              next_break_type = g_unichar_break_type (next_wc);
            next_break_type = BREAK_TYPE_SAFE (next_break_type);
        }

        type = g_unichar_type (wc);
#if !GR_LB

      /* Can't just use the type here since isspace() doesn't
       * correspond to a Unicode character type
       */
        attrs[i].is_white = g_unichar_isspace (wc);
#endif
      /* ---- Cursor position breaks (Grapheme breaks) ---- */

        if (wc == '\n')
        {
          /* Break before line feed unless prev char is a CR */

            if (prev_wc != '\r')
                attrs[i].is_cursor_position = TRUE;
            else
                attrs[i].is_cursor_position = FALSE;
        }
        else if (i == 0 ||
               prev_type == G_UNICODE_CONTROL ||
               prev_type == G_UNICODE_FORMAT)
        {
          /* Break at first position (must be special cased, or if the
           * first char is say a combining mark there won't be a
           * cursor position at the start, which seems wrong to me
           * ???? - maybe it makes sense though, who knows)
           */
          /* break after all format or control characters */
            attrs[i].is_cursor_position = TRUE;
        }
#if !GR_LB
        else
        {
            switch (type)
            {
                case G_UNICODE_CONTROL:
                case G_UNICODE_FORMAT:
              /* Do not break before any format or control characters */
                    attrs[i].is_cursor_position = FALSE;
                    break;

                case G_UNICODE_COMBINING_MARK:
                case G_UNICODE_ENCLOSING_MARK:
                case G_UNICODE_NON_SPACING_MARK:
              /* Unicode spec includes "Combining marks plus Tibetan
               * subjoined characters" as joining chars, but lists the
               * Tibetan subjoined characters as combining marks, and
               * g_unichar_type() returns NON_SPACING_MARK for the Tibetan
               * subjoined characters. So who knows, beats me.
               */

              /* It's a joining character, break only if preceded by
               * control or format; we already handled the case where
               * it was preceded earlier, so here we know it wasn't,
               * don't break
               */
                    attrs[i].is_cursor_position = FALSE;
                    break;

                case G_UNICODE_LOWERCASE_LETTER:
                case G_UNICODE_MODIFIER_LETTER:
                case G_UNICODE_OTHER_LETTER:
                case G_UNICODE_TITLECASE_LETTER:
                case G_UNICODE_UPPERCASE_LETTER:
                    if (JAMO (wc))
                    {
                  /* Break before Jamo if they are in a broken sequence or
                   * next to non-Jamo, otherwise don't
                   */
                        if (LEADING_JAMO (wc) &&
                                !LEADING_JAMO (prev_wc))
                            attrs[i].is_cursor_position = TRUE;
                        else if (VOWEL_JAMO (wc) &&
                                !LEADING_JAMO (prev_wc) &&
                                !VOWEL_JAMO (prev_wc))
                            attrs[i].is_cursor_position = TRUE;
                        else if (TRAILING_JAMO (wc) &&
                                !LEADING_JAMO (prev_wc) &&
                                !VOWEL_JAMO (prev_wc) &&
                                !TRAILING_JAMO (prev_wc))
                            attrs[i].is_cursor_position = TRUE;
                        else
                            attrs[i].is_cursor_position = FALSE;
                    }
                    else
                    {
                  /* Handle non-Jamo non-combining chars */

                  /* Break if preceded by Jamo; don't break if a
                   * letter is preceded by a virama; break in all
                   * other cases. No need to check whether we're
                   * preceded by Jamo explicitly, since a Jamo is not
                   * a virama, we just break in all cases where we
                   * aren't preceded by a virama. Don't fool with viramas
                   * if we aren't part of a script that uses them.
                   */

                        if (VIRAMA_SCRIPT (wc))
                        {
                      /* Check whether we're preceded by a virama; this
                       * could use some optimization.
                       */
                            if (VIRAMA (prev_wc))
                                attrs[i].is_cursor_position = FALSE;
                            else
                                attrs[i].is_cursor_position = TRUE;
                        }
                        else
                        {
                            attrs[i].is_cursor_position = TRUE;
                        }
                    }
                    break;

                default:
              /* Some weirdo char, just break here, why not */
                    attrs[i].is_cursor_position = TRUE;
                    break;
            }
        }

// can this be done with graphite?
      /* If this is a grapheme boundary, we have to decide if backspace
       * deletes a character or the whole grapheme cluster */
      if (attrs[i].is_cursor_position)
        attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);
      else
        attrs[i].backspace_deletes_character = FALSE;

// Leave line, word and sentence breaking alone for now
      /* ---- Line breaking ---- */

      break_op = BREAK_ALREADY_HANDLED;

      g_assert (prev_break_type != G_UNICODE_BREAK_SPACE);

      attrs[i].is_line_break = FALSE;
      attrs[i].is_mandatory_break = FALSE;
      
      if (attrs[i].is_cursor_position) /* If it's not a grapheme boundary,
                                        * it's not a line break either
                                        */
        {
        /* space followed by a combining mark is handled
         * specially; (rule 7a from TR 14)
         */
        if (break_type == G_UNICODE_BREAK_SPACE &&
            next_break_type == G_UNICODE_BREAK_COMBINING_MARK)
          break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
            
          /* Unicode doesn't specify char wrap; we wrap around all chars
           * except where a line break is prohibited, which means we
           * effectively break everywhere except inside runs of spaces.
           */
          attrs[i].is_char_break = TRUE;          
          
          switch (prev_break_type)
            {
            case G_UNICODE_BREAK_MANDATORY:
            case G_UNICODE_BREAK_LINE_FEED:
            case G_UNICODE_BREAK_NEXT_LINE:
              attrs[i].is_line_break = TRUE;
              attrs[i].is_mandatory_break = TRUE;
              break;

            case G_UNICODE_BREAK_CARRIAGE_RETURN:
              if (wc != '\n')
                {
                  attrs[i].is_line_break = TRUE;
                  attrs[i].is_mandatory_break = TRUE;
                }
              break;

            case G_UNICODE_BREAK_CONTINGENT:
              /* can break after 0xFFFC by default, though we might want
               * to eventually have a PangoLayout setting or
               * PangoAttribute that disables this, if for some
               * application breaking after objects is not desired.
               */
              break_op = BREAK_ALLOWED;
              break;

            case G_UNICODE_BREAK_SURROGATE:
            g_assert_not_reached ();
              break;

            case G_UNICODE_BREAK_AMBIGUOUS:
              /* FIXME we need to resolve the East Asian width
               * to decide what to do here
               */
            case G_UNICODE_BREAK_COMPLEX_CONTEXT:
              /* FIXME language engines should handle this case... */
            case G_UNICODE_BREAK_UNKNOWN:
              /* treat unknown, complex, ambiguous as if they were
               * alphabetic for now.
               */
              prev_break_type = G_UNICODE_BREAK_ALPHABETIC;
              /* FALL THRU to use the pair table if appropriate */

            default:

              /* Note that our table assumes that combining marks
               * are only applied to alphabetic characters;
               * tech report 14 explains how to remove this assumption
               * from the code, if anyone ever cares, but it shouldn't
               * be a problem. Also this issue sort of goes
               * away since we only look for breaks on grapheme
               * boundaries.
               */

              g_assert (IN_BREAK_TABLE (prev_break_type));

              switch (break_type)
                {
                case G_UNICODE_BREAK_MANDATORY:
                case G_UNICODE_BREAK_LINE_FEED:
                case G_UNICODE_BREAK_CARRIAGE_RETURN:
                case G_UNICODE_BREAK_NEXT_LINE:
                case G_UNICODE_BREAK_SPACE:
                  /* These types all "pile up" at the end of lines and
                   * get elided.
                   */
                  break_op = BREAK_PROHIBITED;
                  break;

                case G_UNICODE_BREAK_CONTINGENT:
                  /* break before 0xFFFC by default, eventually
                   * make this configurable?
                   */
                  break_op = BREAK_ALLOWED;
                  break;

                case G_UNICODE_BREAK_AMBIGUOUS:
                  /* FIXME resolve East Asian width to figure out what to do */
                case G_UNICODE_BREAK_COMPLEX_CONTEXT:
                  /* FIXME language engine analysis */
                case G_UNICODE_BREAK_UNKNOWN:
                case G_UNICODE_BREAK_ALPHABETIC:
                  /* treat all of the above as alphabetic for now */
                  break_op = BREAK_OP (prev_break_type, G_UNICODE_BREAK_ALPHABETIC);
                  break;

                case G_UNICODE_BREAK_SURROGATE:
              g_assert_not_reached ();
                  break;

                default:
                  g_assert (IN_BREAK_TABLE (prev_break_type));
                  g_assert (IN_BREAK_TABLE (break_type));
                  break_op = BREAK_OP (prev_break_type, break_type);
                  break;
                }
              break;
            }

          if (break_op != BREAK_ALREADY_HANDLED)
            {
              switch (break_op)
                {
                case BREAK_PROHIBITED:
                  /* can't break here */                  
                  attrs[i].is_char_break = FALSE;
                  break;

                case BREAK_IF_SPACES:
                  /* break if prev char was space */
                  if (prev_was_break_space)                    
                    attrs[i].is_line_break = TRUE;
                  break;

                case BREAK_ALLOWED:
                  attrs[i].is_line_break = TRUE;
                  break;

                default:
                  g_assert_not_reached ();
                  break;
                }
            }
        }

      if (break_type != G_UNICODE_BREAK_SPACE)
        {
          prev_break_type = break_type;
          prev_was_break_space = FALSE;
        }
      else
        prev_was_break_space = TRUE;

#endif
      /* ---- Word breaks ---- */

      /* default to not a word start/end */
      attrs[i].is_word_start = FALSE;
      attrs[i].is_word_end = FALSE;

      if (current_word_type != WordNone)
        {
          /* Check for a word end */
          switch (type)
            {
            case G_UNICODE_COMBINING_MARK:
            case G_UNICODE_ENCLOSING_MARK:
            case G_UNICODE_NON_SPACING_MARK:
          case G_UNICODE_FORMAT:
              /* nothing, we just eat these up as part of the word */
              break;

            case G_UNICODE_LOWERCASE_LETTER:
            case G_UNICODE_MODIFIER_LETTER:
            case G_UNICODE_OTHER_LETTER:
            case G_UNICODE_TITLECASE_LETTER:
            case G_UNICODE_UPPERCASE_LETTER:
              if (current_word_type == WordLetters)
                {
                  /* Japanese special cases for ending the word */
                  if (JAPANESE (last_word_letter) ||
                      JAPANESE (wc))
                    {
                      if ((HIRAGANA (last_word_letter) &&
                           !HIRAGANA (wc)) ||
                          (KATAKANA (last_word_letter) &&
                           !(KATAKANA (wc) || HIRAGANA (wc))) ||
                          (KANJI (last_word_letter) &&
                           !(HIRAGANA (wc) || KANJI (wc))) ||
                          (JAPANESE (last_word_letter) &&
                           !JAPANESE (wc)) ||
                          (!JAPANESE (last_word_letter) &&
                           JAPANESE (wc)))
                        attrs[i].is_word_end = TRUE;
                    }
                }
              else
                {
                  /* end the number word, start the letter word */
                  attrs[i].is_word_end = TRUE;
                  attrs[i].is_word_start = TRUE;
                  current_word_type = WordLetters;
                }

              last_word_letter = wc;
              break;

            case G_UNICODE_DECIMAL_NUMBER:
            case G_UNICODE_LETTER_NUMBER:
            case G_UNICODE_OTHER_NUMBER:
              if (current_word_type != WordNumbers)
                {
                  attrs[i].is_word_end = TRUE;
                  attrs[i].is_word_start = TRUE;
                  current_word_type = WordNumbers;
                }

              last_word_letter = wc;
              break;

            default:
              /* Punctuation, control/format chars, etc. all end a word. */
              attrs[i].is_word_end = TRUE;
            current_word_type = WordNone;
              break;
            }
        }
      else
        {
          /* Check for a word start */
          switch (type)
            {
            case G_UNICODE_LOWERCASE_LETTER:
            case G_UNICODE_MODIFIER_LETTER:
            case G_UNICODE_OTHER_LETTER:
            case G_UNICODE_TITLECASE_LETTER:
            case G_UNICODE_UPPERCASE_LETTER:
              current_word_type = WordLetters;
              last_word_letter = wc;
              attrs[i].is_word_start = TRUE;
              break;

            case G_UNICODE_DECIMAL_NUMBER:
            case G_UNICODE_LETTER_NUMBER:
            case G_UNICODE_OTHER_NUMBER:
              current_word_type = WordNumbers;
              last_word_letter = wc;
              attrs[i].is_word_start = TRUE;
              break;

            default:
              /* No word here */
              break;
            }
        }

      /* ---- Sentence breaks ---- */
      
      /* The Unicode spec specifies sentence breakpoints, so that a piece of
       * text would be partitioned into sentences, and all characters would
       * be inside some sentence. This code implements that for is_sentence_boundary,
       * but tries to keep leading/trailing whitespace out of sentences for
       * the start/end flags
       */

      /* The Unicode spec seems to say that one trailing line/para
       * separator can be tacked on to a sentence ending in ! or ?,
       * but not a sentence ending in period; I think they're on crack
       * so am allowing one to be tacked onto a sentence ending in period.
       */

#define MAYBE_START_NEW_SENTENCE                                \
              switch (type)                                     \
                {                                               \
                case G_UNICODE_LINE_SEPARATOR:                  \
                case G_UNICODE_PARAGRAPH_SEPARATOR:             \
                case G_UNICODE_CONTROL:                         \
                case G_UNICODE_FORMAT:                          \
                case G_UNICODE_SPACE_SEPARATOR:                 \
                  sentence_state = STATE_SENTENCE_OUTSIDE;      \
                  break;                                        \
                                                                \
                default:                                        \
                  sentence_state = STATE_SENTENCE_BODY;         \
                  attrs[i].is_sentence_start = TRUE;            \
                  break;                                        \
                }
      
      /* No sentence break at the start of the text */

      /* default to not a sentence breakpoint */
      attrs[i].is_sentence_boundary = FALSE;
      attrs[i].is_sentence_start = FALSE;
      attrs[i].is_sentence_end = FALSE;
      
      /* FIXME the Unicode spec lumps control/format chars with
       * line/para separators in descriptive text, but not in the
       * character class specs, in table 5-6, so who knows whether you
       * are actually supposed to break on control/format
       * characters. Seems semi-broken to break on tabs...
       */

      /* Break after line/para separators except carriage return
       * followed by newline
       */
      switch (prev_type)
        {
        case G_UNICODE_LINE_SEPARATOR:
        case G_UNICODE_PARAGRAPH_SEPARATOR:
        case G_UNICODE_CONTROL:
        case G_UNICODE_FORMAT:
          if (wc == '\r')
            {
              if (next_wc != '\n')
                attrs[i].is_sentence_boundary = TRUE;
            }
          else
            attrs[i].is_sentence_boundary = TRUE;
          break;

        default:
          break;
        }

      /* break before para/line separators except newline following
       * carriage return
       */
      switch (type)
        {
        case G_UNICODE_LINE_SEPARATOR:
        case G_UNICODE_PARAGRAPH_SEPARATOR:
        case G_UNICODE_CONTROL:
        case G_UNICODE_FORMAT:
          if (wc == '\n')
            {
              if (prev_wc != '\r')
                attrs[i].is_sentence_boundary = TRUE;
            }
          else
            attrs[i].is_sentence_boundary = TRUE;
          break;

        default:
          break;
        }

      switch (sentence_state)
        {
        case STATE_SENTENCE_OUTSIDE:
          /* Start sentence if we have non-whitespace/format/control */
          switch (type)
            {
            case G_UNICODE_LINE_SEPARATOR:
            case G_UNICODE_PARAGRAPH_SEPARATOR:
            case G_UNICODE_CONTROL:
            case G_UNICODE_FORMAT:
            case G_UNICODE_SPACE_SEPARATOR:
              break;

            default:
              attrs[i].is_sentence_start = TRUE;
              sentence_state = STATE_SENTENCE_BODY;
              break;
            }
          break;

        case STATE_SENTENCE_BODY:
          /* If we already broke here due to separators, end the sentence. */
          if (attrs[i].is_sentence_boundary)
            {
              attrs[i].is_sentence_end = TRUE;

              MAYBE_START_NEW_SENTENCE;
            }
          else
            {
              if (wc == '.')
                sentence_state = STATE_SENTENCE_DOT;
              else if (wc == '?' || wc == '!')
                sentence_state = STATE_SENTENCE_TERM;
            }
          break;

        case STATE_SENTENCE_TERM:
          /* End sentence on anything but close punctuation and some
           * loosely-specified OTHER_PUNCTUATION such as period,
           * comma, etc.; follow Unicode rules for breaks
           */
          switch (type)
            {
            case G_UNICODE_OTHER_PUNCTUATION:
            case G_UNICODE_CLOSE_PUNCTUATION:
              if (type == G_UNICODE_CLOSE_PUNCTUATION ||
                  wc == '.' ||
                  wc == ',' ||
                  wc == '?' ||
                  wc == '!')
                sentence_state = STATE_SENTENCE_POST_TERM_CLOSE;
              else
                {
                  attrs[i].is_sentence_end = TRUE;
                  attrs[i].is_sentence_boundary = TRUE;

                  MAYBE_START_NEW_SENTENCE;
                }
              break;

            case G_UNICODE_SPACE_SEPARATOR:
              attrs[i].is_sentence_end = TRUE;
              sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
              break;

            case G_UNICODE_LINE_SEPARATOR:
            case G_UNICODE_PARAGRAPH_SEPARATOR:
              attrs[i].is_sentence_end = TRUE;
              sentence_state = STATE_SENTENCE_POST_TERM_SEP;
              break;

            default:
              attrs[i].is_sentence_end = TRUE;
              attrs[i].is_sentence_boundary = TRUE;

              MAYBE_START_NEW_SENTENCE;

              break;
            }
          break;

        case STATE_SENTENCE_POST_TERM_CLOSE:
          /* End sentence on anything besides more punctuation; follow
           * rules for breaks
           */
          switch (type)
            {
            case G_UNICODE_OTHER_PUNCTUATION:
            case G_UNICODE_CLOSE_PUNCTUATION:
              if (type == G_UNICODE_CLOSE_PUNCTUATION ||
                  wc == '.' ||
                  wc == ',' ||
                  wc == '?' ||
                  wc == '!')
                /* continue in this state */
                ;
              else
                {
                  attrs[i].is_sentence_end = TRUE;
                  attrs[i].is_sentence_boundary = TRUE;

                  MAYBE_START_NEW_SENTENCE;
                }
              break;

            case G_UNICODE_SPACE_SEPARATOR:
              attrs[i].is_sentence_end = TRUE;
              sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
              break;

            case G_UNICODE_LINE_SEPARATOR:
            case G_UNICODE_PARAGRAPH_SEPARATOR:
              attrs[i].is_sentence_end = TRUE;
              /* undo the unconditional break-at-all-line/para-separators
               * from above; I'm not sure this is what the Unicode spec
               * intends, but it seems right - we get to include
               * a single line/para separator in the sentence according
               * to their rules
               */
              attrs[i].is_sentence_boundary = FALSE;
              sentence_state = STATE_SENTENCE_POST_TERM_SEP;
              break;

            default:
              attrs[i].is_sentence_end = TRUE;
              attrs[i].is_sentence_boundary = TRUE;

              MAYBE_START_NEW_SENTENCE;

              break;
            }
          break;

        case STATE_SENTENCE_POST_TERM_SPACE:

          /* Sentence is definitely already ended; to enter this state
           * we had to see a space, which ends the sentence.
           */

          switch (type)
            {
            case G_UNICODE_SPACE_SEPARATOR:
              /* continue in this state */
              break;

            case G_UNICODE_LINE_SEPARATOR:
            case G_UNICODE_PARAGRAPH_SEPARATOR:
              /* undo the unconditional break-at-all-line/para-separators
               * from above; I'm not sure this is what the Unicode spec
               * intends, but it seems right
               */
              attrs[i].is_sentence_boundary = FALSE;
              sentence_state = STATE_SENTENCE_POST_TERM_SEP;
              break;

            default:
              attrs[i].is_sentence_boundary = TRUE;

              MAYBE_START_NEW_SENTENCE;

              break;
            }
          break;

        case STATE_SENTENCE_POST_TERM_SEP:
          /* Break is forced at this point, unless we're a newline
           * after a CR, then we will break after the newline on the
           * next iteration. Only a single Sep can be in the
           * sentence.
           */
          if (!(prev_wc == '\r' && wc == '\n'))
            attrs[i].is_sentence_boundary = TRUE;

          MAYBE_START_NEW_SENTENCE;

          break;

        case STATE_SENTENCE_DOT:
          switch (type)
            {
            case G_UNICODE_CLOSE_PUNCTUATION:
              sentence_state = STATE_SENTENCE_POST_DOT_CLOSE;
              break;

            case G_UNICODE_SPACE_SEPARATOR:
              possible_sentence_end = i;
              sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
              break;

            default:
              /* If we broke on a control/format char, end the
               * sentence; else this was not a sentence end, since
               * we didn't enter the POST_DOT_SPACE state.
               */
              if (attrs[i].is_sentence_boundary)
                {
                  attrs[i].is_sentence_end = TRUE;

                  MAYBE_START_NEW_SENTENCE;
                }
              else
                sentence_state = STATE_SENTENCE_BODY;
              break;
            }
          break;

        case STATE_SENTENCE_POST_DOT_CLOSE:
          switch (type)
            {
            case G_UNICODE_SPACE_SEPARATOR:
              possible_sentence_end = i;
              sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
              break;

            default:
              /* If we broke on a control/format char, end the
               * sentence; else this was not a sentence end, since
               * we didn't enter the POST_DOT_SPACE state.
               */
              if (attrs[i].is_sentence_boundary)
                {
                  attrs[i].is_sentence_end = TRUE;

                  MAYBE_START_NEW_SENTENCE;
                }
              else
                sentence_state = STATE_SENTENCE_BODY;
              break;
            }
          break;

        case STATE_SENTENCE_POST_DOT_SPACE:

          possible_sentence_boundary = i;

          switch (type)
            {
            case G_UNICODE_SPACE_SEPARATOR:
              /* remain in current state */
              break;

            case G_UNICODE_OPEN_PUNCTUATION:
              sentence_state = STATE_SENTENCE_POST_DOT_OPEN;
              break;

            case G_UNICODE_LOWERCASE_LETTER:
              /* wasn't a sentence-ending period; so re-enter the sentence
               * body
               */
              sentence_state = STATE_SENTENCE_BODY;
              break;

            default:
              /* End the sentence, break, maybe start a new one */

              g_assert (possible_sentence_end >= 0);
              g_assert (possible_sentence_boundary >= 0);

              attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
              attrs[possible_sentence_end].is_sentence_end = TRUE;

              possible_sentence_end = -1;
              possible_sentence_boundary = -1;

              MAYBE_START_NEW_SENTENCE;
              
              break;
            }
          break;

        case STATE_SENTENCE_POST_DOT_OPEN:
          switch (type)
            {
            case G_UNICODE_OPEN_PUNCTUATION:
              /* continue in current state */
              break;

            case G_UNICODE_LOWERCASE_LETTER:
              /* wasn't a sentence-ending period; so re-enter the sentence
               * body
               */
              sentence_state = STATE_SENTENCE_BODY;
              break;

            default:
              /* End the sentence, break, maybe start a new one */

              g_assert (possible_sentence_end >= 0);
              g_assert (possible_sentence_boundary >= 0);

              attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
              attrs[possible_sentence_end].is_sentence_end = TRUE;

              possible_sentence_end = -1;
              possible_sentence_boundary = -1;

              MAYBE_START_NEW_SENTENCE;

              break;
            }
          break;

        case STATE_SENTENCE_POST_DOT_SEP:
          /* Break is forced at this point, unless we're a newline
           * after a CR, then we will break after the newline on the
           * next iteration. Only a single Sep can be in the
           * sentence.
           */
          if (!(prev_wc == '\r' && wc == '\n'))
            attrs[i].is_sentence_boundary = TRUE;

          g_assert (possible_sentence_end >= 0);
          g_assert (possible_sentence_boundary >= 0);

          attrs[possible_sentence_end].is_sentence_end = TRUE;

          possible_sentence_end = -1;
          possible_sentence_boundary = -1;

          MAYBE_START_NEW_SENTENCE;

          break;

        default:
          g_assert_not_reached ();
          break;
        }

      prev_type = type;
      prev_wc = wc;

      /* wc might not be a valid unicode base character, but really all we
       * need to know is the last non-combining character */
      if (type != G_UNICODE_COMBINING_MARK && 
          type != G_UNICODE_ENCLOSING_MARK && 
          type != G_UNICODE_NON_SPACING_MARK)
        base_character = wc;
    }
}



/* script_break hook for the Graphite language engine.
 *
 * Decides, per run of text, which break analysis fills in @attrs:
 *   - if the run's font advertises a Graphite "Silf" table through its
 *     fontconfig FC_CAPABILITY string, pango_default_break_for_graphite()
 *     is used and nothing else is touched;
 *   - otherwise (no font, no capability string, or a non-Graphite font)
 *     pango_default_break() is used, and when the Graphite helper library
 *     was found at init time every entry additionally gets
 *     backspace_deletes_character = TRUE.
 *
 * engine:     the engine instance (unused here).
 * text:       text to analyse.
 * length:     length of @text as passed on to the break functions.
 * analysis:   analysis for the run; its font is inspected for capabilities.
 * attrs:      output array of logical attributes.
 * attrs_len:  number of entries in @attrs.
 */
void
graphite_engine_break (PangoEngineLang *engine,
            const gchar   *text,
            gint           length,
            PangoAnalysis *analysis,
            PangoLogAttr  *attrs,
            int            attrs_len)
{
    /* Tolerate a missing analysis: treat it the same as "no font". */
    PangoFcFont *fcfont = analysis ? (PangoFcFont *) analysis->font : NULL;
    FcChar8 *fontcap = NULL;
    int i;

    if (fcfont != NULL
        && FcPatternGetString (fcfont->font_pattern, FC_CAPABILITY, 0, &fontcap) == FcResultMatch
        && fontcap != NULL
        && strstr ((const char *) fontcap, "ttable:Silf") != NULL)
    {
        /* Graphite font: the Graphite-aware analysis does all the work.
         * Call and return separately; `return expr;` is not valid ISO C
         * in a function returning void.
         */
        pango_default_break_for_graphite (text, length, analysis, attrs, attrs_len);
        return;
    }

    /* Every other case shares a single default pass.  The previous code
     * ran pango_default_break() twice for fonts that had a capability
     * string without a Silf table.
     */
    pango_default_break (text, length, analysis, attrs, attrs_len);

    if (have_graphite)
    {
        for (i = 0; i < attrs_len; i++)
        {
            attrs[i].backspace_deletes_character = TRUE;
        }
    }
}




static void
init_graphite (void)
{
      if (!graphite_initialised)
      {
            graphite_initialised = TRUE;
            //g_message("opening libgraphitepango");
            graphitelibrary = g_module_open("libgraphitepango.so.0", G_MODULE_BIND_LAZY);
            if (graphitelibrary)
            {
                  //g_message("looking for graphite_PangoLogAttrs");
            gpointer gp_break;
                  have_graphite = g_module_symbol(graphitelibrary,
                        "graphite_PangoLogAttrs",
                        &gp_break);
                graphite_real_break = gp_break;
                  //if (!have_graphite) g_message("Failed to find graphite_PangoLogAttrs");
                  //else g_message("Found graphite_PangoLogAttrs");
            }
      }
} 


/* Class initialiser for the break engine: installs graphite_engine_break
 * as the class's script_break hook.
 */
static void
graphite_engine_break_class_init (PangoEngineLangClass *class)
{
    class->script_break = graphite_engine_break;
}

/* Instantiate the GraphiteEngineBreak engine type through Pango's helper
 * macro, with graphite_engine_break_class_init as the class initialiser
 * and NULL as the final argument.
 * NOTE(review): presumably this expansion is what provides
 * graphite_engine_break_type and graphite_engine_break_register_type()
 * used by the module entry points -- confirm against pango-engine.h.
 */
PANGO_ENGINE_LANG_DEFINE_TYPE (GraphiteEngineBreak, graphite_engine_break,
                        graphite_engine_break_class_init, NULL);

/* Module entry point "init": runs the one-time Graphite library lookup
 * and then registers the break engine type with @module.
 */
void 
PANGO_MODULE_ENTRY(init) (GTypeModule *module)
{
    /* Resolve the optional Graphite helper before the type is registered. */
    init_graphite();
    graphite_engine_break_register_type (module);
    //g_message("init pangographite-break");
}

/* Module entry point "exit": intentionally does nothing.  Note that the
 * library handle opened by init_graphite() is not closed here.
 */
void 
PANGO_MODULE_ENTRY(exit) (void)
{
      //g_message("exit pangographite-break");
}

/* Module entry point "list": reports the engines this module provides.
 *
 * engines:    receives a pointer to the static script_engines table.
 * n_engines:  receives the number of entries in that table.
 *
 * The Graphite library lookup is triggered first, as in the "init" entry.
 */
void
PANGO_MODULE_ENTRY(list) (PangoEngineInfo **engines,
                    int              *n_engines)
{
    init_graphite ();

    *n_engines = G_N_ELEMENTS (script_engines);
    *engines   = script_engines;
}

/* Module entry point "create": instantiates an engine by identifier.
 *
 * id:  engine identifier requested by the caller.
 *
 * Returns a new object of graphite_engine_break_type when @id equals
 * LANG_ENGINE_NAME, and NULL for any other identifier.
 */
PangoEngine *
PANGO_MODULE_ENTRY(create) (const char *id)
{
    /* Only one engine is offered; reject everything else up front. */
    if (strcmp (id, LANG_ENGINE_NAME) != 0)
        return NULL;

    return g_object_new (graphite_engine_break_type, NULL);
}



Generated by  Doxygen 1.6.0   Back to index