You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
371 lines
11 KiB
371 lines
11 KiB
2 years ago
|
/*
|
||
|
YUI 3.17.2 (build 9c3c78e)
|
||
|
Copyright 2014 Yahoo! Inc. All rights reserved.
|
||
|
Licensed under the BSD License.
|
||
|
http://yuilibrary.com/license/
|
||
|
*/
|
||
|
|
||
|
YUI.add('text-wordbreak', function (Y, NAME) {
|
||
|
|
||
|
/**
|
||
|
* Provides utility methods for splitting strings on word breaks and determining
|
||
|
* whether a character index represents a word boundary.
|
||
|
*
|
||
|
* @module text
|
||
|
* @submodule text-wordbreak
|
||
|
*/
|
||
|
|
||
|
/**
|
||
|
* <p>
|
||
|
* Provides utility methods for splitting strings on word breaks and determining
|
||
|
* whether a character index represents a word boundary, using the generic word
|
||
|
* breaking algorithm defined in the Unicode Text Segmentation guidelines
|
||
|
* (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard
|
||
|
* Annex #29</a>).
|
||
|
* </p>
|
||
|
*
|
||
|
* <p>
|
||
|
* This algorithm provides a reasonable default for many languages. However, it
|
||
|
* does not cover language or context specific requirements, and it does not
|
||
|
* provide meaningful results at all for languages that don't use spaces between
|
||
|
* words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
|
||
|
* word breaking services usually provide significantly better results with
|
||
|
* better performance.
|
||
|
* </p>
|
||
|
*
|
||
|
* @class Text.WordBreak
|
||
|
* @static
|
||
|
*/
|
||
|
|
||
|
var Text = Y.Text,
|
||
|
WBData = Text.Data.WordBreak,
|
||
|
|
||
|
// Constants representing code point classifications.
|
||
|
ALETTER = 0,
|
||
|
MIDNUMLET = 1,
|
||
|
MIDLETTER = 2,
|
||
|
MIDNUM = 3,
|
||
|
NUMERIC = 4,
|
||
|
CR = 5,
|
||
|
LF = 6,
|
||
|
NEWLINE = 7,
|
||
|
EXTEND = 8,
|
||
|
FORMAT = 9,
|
||
|
KATAKANA = 10,
|
||
|
EXTENDNUMLET = 11,
|
||
|
OTHER = 12,
|
||
|
|
||
|
// RegExp objects generated from code point data. Each regex matches a single
|
||
|
// character against a set of Unicode code points. The index of each item in
|
||
|
// this array must match its corresponding code point constant value defined
|
||
|
// above.
|
||
|
SETS = [
|
||
|
new RegExp(WBData.aletter),
|
||
|
new RegExp(WBData.midnumlet),
|
||
|
new RegExp(WBData.midletter),
|
||
|
new RegExp(WBData.midnum),
|
||
|
new RegExp(WBData.numeric),
|
||
|
new RegExp(WBData.cr),
|
||
|
new RegExp(WBData.lf),
|
||
|
new RegExp(WBData.newline),
|
||
|
new RegExp(WBData.extend),
|
||
|
new RegExp(WBData.format),
|
||
|
new RegExp(WBData.katakana),
|
||
|
new RegExp(WBData.extendnumlet)
|
||
|
],
|
||
|
|
||
|
EMPTY_STRING = '',
|
||
|
PUNCTUATION = new RegExp('^' + WBData.punctuation + '$'),
|
||
|
WHITESPACE = /\s/,
|
||
|
|
||
|
WordBreak = {
|
||
|
// -- Public Static Methods ------------------------------------------------
|
||
|
|
||
|
/**
|
||
|
* Splits the specified string into an array of individual words.
|
||
|
*
|
||
|
* @method getWords
|
||
|
* @param {String} string String to split.
|
||
|
* @param {Object} options (optional) Options object containing zero or more
|
||
|
* of the following properties:
|
||
|
*
|
||
|
* <dl>
|
||
|
* <dt>ignoreCase (Boolean)</dt>
|
||
|
* <dd>
|
||
|
* If <code>true</code>, the string will be converted to lowercase
|
||
|
* before being split. Default is <code>false</code>.
|
||
|
* </dd>
|
||
|
*
|
||
|
* <dt>includePunctuation (Boolean)</dt>
|
||
|
* <dd>
|
||
|
* If <code>true</code>, the returned array will include punctuation
|
||
|
* characters. Default is <code>false</code>.
|
||
|
* </dd>
|
||
|
*
|
||
|
* <dt>includeWhitespace (Boolean)</dt>
|
||
|
* <dd>
|
||
|
* If <code>true</code>, the returned array will include whitespace
|
||
|
* characters. Default is <code>false</code>.
|
||
|
* </dd>
|
||
|
* </dl>
|
||
|
* @return {Array} Array of words.
|
||
|
* @static
|
||
|
*/
|
||
|
getWords: function (string, options) {
|
||
|
var i = 0,
|
||
|
map = WordBreak._classify(string),
|
||
|
len = map.length,
|
||
|
word = [],
|
||
|
words = [],
|
||
|
chr,
|
||
|
includePunctuation,
|
||
|
includeWhitespace;
|
||
|
|
||
|
if (!options) {
|
||
|
options = {};
|
||
|
}
|
||
|
|
||
|
if (options.ignoreCase) {
|
||
|
string = string.toLowerCase();
|
||
|
}
|
||
|
|
||
|
includePunctuation = options.includePunctuation;
|
||
|
includeWhitespace = options.includeWhitespace;
|
||
|
|
||
|
// Loop through each character in the classification map and determine
|
||
|
// whether it precedes a word boundary, building an array of distinct
|
||
|
// words as we go.
|
||
|
for (; i < len; ++i) {
|
||
|
chr = string.charAt(i);
|
||
|
|
||
|
// Append this character to the current word.
|
||
|
word.push(chr);
|
||
|
|
||
|
// If there's a word boundary between the current character and the
|
||
|
// next character, append the current word to the words array and
|
||
|
// start building a new word.
|
||
|
if (WordBreak._isWordBoundary(map, i)) {
|
||
|
word = word.join(EMPTY_STRING);
|
||
|
|
||
|
if (word &&
|
||
|
(includeWhitespace || !WHITESPACE.test(word)) &&
|
||
|
(includePunctuation || !PUNCTUATION.test(word))) {
|
||
|
words.push(word);
|
||
|
}
|
||
|
|
||
|
word = [];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return words;
|
||
|
},
|
||
|
|
||
|
/**
|
||
|
* Returns an array containing only unique words from the specified string.
|
||
|
* For example, the string <code>'foo bar baz foo'</code> would result in
|
||
|
* the array <code>['foo', 'bar', 'baz']</code>.
|
||
|
*
|
||
|
* @method getUniqueWords
|
||
|
* @param {String} string String to split.
|
||
|
* @param {Object} options (optional) Options (see <code>getWords()</code>
|
||
|
* for details).
|
||
|
* @return {Array} Array of unique words.
|
||
|
* @static
|
||
|
*/
|
||
|
getUniqueWords: function (string, options) {
|
||
|
return Y.Array.unique(WordBreak.getWords(string, options));
|
||
|
},
|
||
|
|
||
|
/**
|
||
|
* <p>
|
||
|
* Returns <code>true</code> if there is a word boundary between the
|
||
|
* specified character index and the next character index (or the end of the
|
||
|
* string).
|
||
|
* </p>
|
||
|
*
|
||
|
* <p>
|
||
|
* Note that there are always word breaks at the beginning and end of a
|
||
|
* string, so <code>isWordBoundary('', 0)</code> and
|
||
|
* <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
|
||
|
* </p>
|
||
|
*
|
||
|
* @method isWordBoundary
|
||
|
* @param {String} string String to test.
|
||
|
* @param {Number} index Character index to test within the string.
|
||
|
* @return {Boolean} <code>true</code> for a word boundary,
|
||
|
* <code>false</code> otherwise.
|
||
|
* @static
|
||
|
*/
|
||
|
isWordBoundary: function (string, index) {
|
||
|
return WordBreak._isWordBoundary(WordBreak._classify(string), index);
|
||
|
},
|
||
|
|
||
|
// -- Protected Static Methods ---------------------------------------------
|
||
|
|
||
|
/**
|
||
|
* Returns a character classification map for the specified string.
|
||
|
*
|
||
|
* @method _classify
|
||
|
* @param {String} string String to classify.
|
||
|
* @return {Array} Classification map.
|
||
|
* @protected
|
||
|
* @static
|
||
|
*/
|
||
|
_classify: function (string) {
|
||
|
var chr,
|
||
|
map = [],
|
||
|
i = 0,
|
||
|
j,
|
||
|
set,
|
||
|
stringLength = string.length,
|
||
|
setsLength = SETS.length,
|
||
|
type;
|
||
|
|
||
|
for (; i < stringLength; ++i) {
|
||
|
chr = string.charAt(i);
|
||
|
type = OTHER;
|
||
|
|
||
|
for (j = 0; j < setsLength; ++j) {
|
||
|
set = SETS[j];
|
||
|
|
||
|
if (set && set.test(chr)) {
|
||
|
type = j;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
map.push(type);
|
||
|
}
|
||
|
|
||
|
return map;
|
||
|
},
|
||
|
|
||
|
/**
|
||
|
* <p>
|
||
|
* Returns <code>true</code> if there is a word boundary between the
|
||
|
* specified character index and the next character index (or the end of the
|
||
|
* string).
|
||
|
* </p>
|
||
|
*
|
||
|
* <p>
|
||
|
* Note that there are always word breaks at the beginning and end of a
|
||
|
* string, so <code>_isWordBoundary('', 0)</code> and
|
||
|
* <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
|
||
|
* </p>
|
||
|
*
|
||
|
* @method _isWordBoundary
|
||
|
* @param {Array} map Character classification map generated by
|
||
|
* <code>_classify</code>.
|
||
|
* @param {Number} index Character index to test.
|
||
|
* @return {Boolean}
|
||
|
* @protected
|
||
|
* @static
|
||
|
*/
|
||
|
_isWordBoundary: function (map, index) {
|
||
|
var prevType,
|
||
|
type = map[index],
|
||
|
nextType = map[index + 1],
|
||
|
nextNextType;
|
||
|
|
||
|
if (index < 0 || (index > map.length - 1 && index !== 0)) {
|
||
|
Y.log('isWordBoundary: index out of bounds', 'warn', 'text-wordbreak');
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// WB5. Don't break between most letters.
|
||
|
if (type === ALETTER && nextType === ALETTER) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
nextNextType = map[index + 2];
|
||
|
|
||
|
// WB6. Don't break letters across certain punctuation.
|
||
|
if (type === ALETTER &&
|
||
|
(nextType === MIDLETTER || nextType === MIDNUMLET) &&
|
||
|
nextNextType === ALETTER) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
prevType = map[index - 1];
|
||
|
|
||
|
// WB7. Don't break letters across certain punctuation.
|
||
|
if ((type === MIDLETTER || type === MIDNUMLET) &&
|
||
|
nextType === ALETTER &&
|
||
|
prevType === ALETTER) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// WB8/WB9/WB10. Don't break inside sequences of digits or digits
|
||
|
// adjacent to letters.
|
||
|
if ((type === NUMERIC || type === ALETTER) &&
|
||
|
(nextType === NUMERIC || nextType === ALETTER)) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// WB11. Don't break inside numeric sequences like "3.2" or
|
||
|
// "3,456.789".
|
||
|
if ((type === MIDNUM || type === MIDNUMLET) &&
|
||
|
nextType === NUMERIC &&
|
||
|
prevType === NUMERIC) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// WB12. Don't break inside numeric sequences like "3.2" or
|
||
|
// "3,456.789".
|
||
|
if (type === NUMERIC &&
|
||
|
(nextType === MIDNUM || nextType === MIDNUMLET) &&
|
||
|
nextNextType === NUMERIC) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// WB4. Ignore format and extend characters.
|
||
|
if (type === EXTEND || type === FORMAT ||
|
||
|
prevType === EXTEND || prevType === FORMAT ||
|
||
|
nextType === EXTEND || nextType === FORMAT) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// WB3. Don't break inside CRLF.
|
||
|
if (type === CR && nextType === LF) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// WB3a. Break before newlines (including CR and LF).
|
||
|
if (type === NEWLINE || type === CR || type === LF) {
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
// WB3b. Break after newlines (including CR and LF).
|
||
|
if (nextType === NEWLINE || nextType === CR || nextType === LF) {
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
// WB13. Don't break between Katakana characters.
|
||
|
if (type === KATAKANA && nextType === KATAKANA) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// WB13a. Don't break from extenders.
|
||
|
if (nextType === EXTENDNUMLET &&
|
||
|
(type === ALETTER || type === NUMERIC || type === KATAKANA ||
|
||
|
type === EXTENDNUMLET)) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// WB13b. Don't break from extenders.
|
||
|
if (type === EXTENDNUMLET &&
|
||
|
(nextType === ALETTER || nextType === NUMERIC ||
|
||
|
nextType === KATAKANA)) {
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// Break after any character not covered by the rules above.
|
||
|
return true;
|
||
|
}
|
||
|
};
|
||
|
|
||
|
Text.WordBreak = WordBreak;
|
||
|
|
||
|
|
||
|
}, '3.17.2', {"requires": ["array-extras", "text-data-wordbreak"]});
|