/* YUI 3.17.2 (build 9c3c78e) Copyright 2014 Yahoo! Inc. All rights reserved. Licensed under the BSD License. http://yuilibrary.com/license/ */ YUI.add('text-wordbreak', function (Y, NAME) { /** * Provides utility methods for splitting strings on word breaks and determining * whether a character index represents a word boundary. * * @module text * @submodule text-wordbreak */ /** *
* Provides utility methods for splitting strings on word breaks and determining * whether a character index represents a word boundary, using the generic word * breaking algorithm defined in the Unicode Text Segmentation guidelines * (Unicode Standard * Annex #29). *
* ** This algorithm provides a reasonable default for many languages. However, it * does not cover language or context specific requirements, and it does not * provide meaningful results at all for languages that don't use spaces between * words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based * word breaking services usually provide significantly better results with * better performance. *
* * @class Text.WordBreak * @static */ var Text = Y.Text, WBData = Text.Data.WordBreak, // Constants representing code point classifications. ALETTER = 0, MIDNUMLET = 1, MIDLETTER = 2, MIDNUM = 3, NUMERIC = 4, CR = 5, LF = 6, NEWLINE = 7, EXTEND = 8, FORMAT = 9, KATAKANA = 10, EXTENDNUMLET = 11, OTHER = 12, // RegExp objects generated from code point data. Each regex matches a single // character against a set of Unicode code points. The index of each item in // this array must match its corresponding code point constant value defined // above. SETS = [ new RegExp(WBData.aletter), new RegExp(WBData.midnumlet), new RegExp(WBData.midletter), new RegExp(WBData.midnum), new RegExp(WBData.numeric), new RegExp(WBData.cr), new RegExp(WBData.lf), new RegExp(WBData.newline), new RegExp(WBData.extend), new RegExp(WBData.format), new RegExp(WBData.katakana), new RegExp(WBData.extendnumlet) ], EMPTY_STRING = '', PUNCTUATION = new RegExp('^' + WBData.punctuation + '$'), WHITESPACE = /\s/, WordBreak = { // -- Public Static Methods ------------------------------------------------ /** * Splits the specified string into an array of individual words. * * @method getWords * @param {String} string String to split. * @param {Object} options (optional) Options object containing zero or more * of the following properties: * *true
, the string will be converted to lowercase
* before being split. Default is false
.
* true
, the returned array will include punctuation
* characters. Default is false
.
* true
, the returned array will include whitespace
* characters. Default is false
.
* 'foo bar baz foo'
would result in
* the array ['foo', 'bar', 'baz']
.
*
* @method getUniqueWords
* @param {String} string String to split.
* @param {Object} options (optional) Options (see getWords()
* for details).
* @return {Array} Array of unique words.
* @static
*/
getUniqueWords: function (string, options) {
return Y.Array.unique(WordBreak.getWords(string, options));
},
/**
*
* Returns true
if there is a word boundary between the
* specified character index and the next character index (or the end of the
* string).
*
* Note that there are always word breaks at the beginning and end of a
* string, so isWordBoundary('', 0)
and
* isWordBoundary('a', 0)
will both return true
.
*
true
for a word boundary,
* false
otherwise.
* @static
*/
isWordBoundary: function (string, index) {
return WordBreak._isWordBoundary(WordBreak._classify(string), index);
},
// -- Protected Static Methods ---------------------------------------------
/**
* Returns a character classification map for the specified string.
*
* @method _classify
* @param {String} string String to classify.
* @return {Array} Classification map.
* @protected
* @static
*/
_classify: function (string) {
var chr,
map = [],
i = 0,
j,
set,
stringLength = string.length,
setsLength = SETS.length,
type;
for (; i < stringLength; ++i) {
chr = string.charAt(i);
type = OTHER;
for (j = 0; j < setsLength; ++j) {
set = SETS[j];
if (set && set.test(chr)) {
type = j;
break;
}
}
map.push(type);
}
return map;
},
/**
*
* Returns true
if there is a word boundary between the
* specified character index and the next character index (or the end of the
* string).
*
* Note that there are always word breaks at the beginning and end of a
* string, so _isWordBoundary('', 0)
and
* _isWordBoundary('a', 0)
will both return true
.
*
_classify
.
* @param {Number} index Character index to test.
* @return {Boolean}
* @protected
* @static
*/
_isWordBoundary: function (map, index) {
var prevType,
type = map[index],
nextType = map[index + 1],
nextNextType;
if (index < 0 || (index > map.length - 1 && index !== 0)) {
Y.log('isWordBoundary: index out of bounds', 'warn', 'text-wordbreak');
return false;
}
// WB5. Don't break between most letters.
if (type === ALETTER && nextType === ALETTER) {
return false;
}
nextNextType = map[index + 2];
// WB6. Don't break letters across certain punctuation.
if (type === ALETTER &&
(nextType === MIDLETTER || nextType === MIDNUMLET) &&
nextNextType === ALETTER) {
return false;
}
prevType = map[index - 1];
// WB7. Don't break letters across certain punctuation.
if ((type === MIDLETTER || type === MIDNUMLET) &&
nextType === ALETTER &&
prevType === ALETTER) {
return false;
}
// WB8/WB9/WB10. Don't break inside sequences of digits or digits
// adjacent to letters.
if ((type === NUMERIC || type === ALETTER) &&
(nextType === NUMERIC || nextType === ALETTER)) {
return false;
}
// WB11. Don't break inside numeric sequences like "3.2" or
// "3,456.789".
if ((type === MIDNUM || type === MIDNUMLET) &&
nextType === NUMERIC &&
prevType === NUMERIC) {
return false;
}
// WB12. Don't break inside numeric sequences like "3.2" or
// "3,456.789".
if (type === NUMERIC &&
(nextType === MIDNUM || nextType === MIDNUMLET) &&
nextNextType === NUMERIC) {
return false;
}
// WB4. Ignore format and extend characters.
if (type === EXTEND || type === FORMAT ||
prevType === EXTEND || prevType === FORMAT ||
nextType === EXTEND || nextType === FORMAT) {
return false;
}
// WB3. Don't break inside CRLF.
if (type === CR && nextType === LF) {
return false;
}
// WB3a. Break before newlines (including CR and LF).
if (type === NEWLINE || type === CR || type === LF) {
return true;
}
// WB3b. Break after newlines (including CR and LF).
if (nextType === NEWLINE || nextType === CR || nextType === LF) {
return true;
}
// WB13. Don't break between Katakana characters.
if (type === KATAKANA && nextType === KATAKANA) {
return false;
}
// WB13a. Don't break from extenders.
if (nextType === EXTENDNUMLET &&
(type === ALETTER || type === NUMERIC || type === KATAKANA ||
type === EXTENDNUMLET)) {
return false;
}
// WB13b. Don't break from extenders.
if (type === EXTENDNUMLET &&
(nextType === ALETTER || nextType === NUMERIC ||
nextType === KATAKANA)) {
return false;
}
// Break after any character not covered by the rules above.
return true;
}
};
Text.WordBreak = WordBreak;
}, '3.17.2', {"requires": ["array-extras", "text-data-wordbreak"]});