// segmentit.js — Chinese word-segmentation helpers for construction-domain text,
// built on the "segmentit" library with a custom domain-term tokenizer.
const { Segment, Tokenizer, dicts, synonyms, stopwords, modules } = require('segmentit');
// Domain-specific construction terms that must always be kept as whole tokens.
// Order matters: longer variants (e.g. 'mm2') are listed before their prefixes ('mm').
const scWords = ['硅酸盐水泥', '综合工', '铜芯', '螺纹钢', 'mm2', 'mm'];
// Alternative (disabled): load the term list from a dictionary file instead of hard-coding it.
/* const buffer = readFileSync(join(__dirname, '../dict/scWords.utf8'));
const dicText = buffer.toString();
export const scWords = dicText.split(/\r?\n/); */
  6. class CusTokenizer extends Tokenizer {
  7. segment;
  8. split(words) {
  9. // 需要的话可以获取到 this.segment 里的各种信息
  10. // const TABLE = this.segment.getDict('TABLE');
  11. const ret = [];
  12. // 这个拦截器放在第一位,优先级最高words 一般只有一个元素
  13. for (const word of words) {
  14. let source = word.w;
  15. for (const scWord of scWords) {
  16. if (source.includes(scWord)) {
  17. // ret.push({ w: scWord, p: TABLE[scWord].p });
  18. ret.push({ w: scWord, p: 1000 }); // 这个p 表示词性,有值时,后一个分词器就不会再对这个词进行处理了, 对于我们使用场景来说没什么用,给个默认值就好
  19. const reg = new RegExp(`${scWord}`, 'g');
  20. source = source.replace(reg, '');
  21. }
  22. }
  23. if (source) ret.push({ w: source });
  24. }
  25. return ret;
  26. }
  27. }
  28. const useDefault = (segmentInstance) => {
  29. segmentInstance.use([CusTokenizer, ...modules]);
  30. segmentInstance.loadDict(dicts); // dicText,
  31. segmentInstance.loadSynonymDict(synonyms);
  32. segmentInstance.loadStopwordDict(stopwords);
  33. return segmentInstance;
  34. };
  35. const segmentit = useDefault(new Segment());
  36. const scCut = (text) => {
  37. const options = { stripPunctuation: true, simple: true };
  38. return segmentit.doSegment(text, options);
  39. };
  40. // 获取分词数组。
  41. const getCutWords = (aName, aSpecs) => {
  42. const cutNames = scCut(aName);
  43. const cutSpecs = scCut(aSpecs); // 规格字符混杂,分词效果极差,使用 cutHMM()效果略好。
  44. const rst = [...cutNames, ...cutSpecs];
  45. return rst;
  46. }
  47. // 返回匹配数。
  48. const getMatchCount = (aUserWords, aMatchStr) => {
  49. let count = 0;
  50. for (const word of aUserWords) {
  51. if (aMatchStr.indexOf(word) > -1) count += 1;
  52. }
  53. return count;
  54. }
  55. // 分词算法相关:别名替换。返回替换后的新字符串。
  56. const alias = (aStr) => {
  57. if (!aStr) {
  58. return aStr;
  59. }
  60. // 替换前
  61. const a1 = ['φ', '混凝土'];
  62. // 替换后,标准。用户自定义词库以a2中的词为标准录入。
  63. const a2 = ['Ф', '砼'];
  64. for (const key in a1) {
  65. const patt = new RegExp(a1[key], 'g');
  66. aStr = aStr.replace(patt, a2[key]);
  67. }
  68. return aStr;
  69. }
  70. const handelThreeWord = (word) => {
  71. function getArr(tem, list) {
  72. const nameArray = scCut(tem);
  73. // 说明是一个词
  74. if (nameArray.length === 1) list.push(tem);
  75. }
  76. const arr = [];
  77. getArr(word[0] + word[1], arr);
  78. getArr(word[1] + word[2], arr);
  79. if (arr.length > 0) return arr;
  80. return [word];
  81. }
  82. // 自定义特殊处理
  83. const cusSegment = (nameArray, keyword) => {
  84. const temArr = [];
  85. for (let a of nameArray) {
  86. // 混凝土 和 砼 统一认成砼
  87. if (a === '混凝土') a = '砼';
  88. if (a === '砼' || a === '砖') {
  89. temArr.push(a);
  90. } else if (a.length > 1) {
  91. // 如果是三个字的词,优先识别成两个字
  92. if (a.length === 3 && !scWords.includes(a)) {
  93. const sArr = handelThreeWord(a);
  94. temArr.push(...sArr);
  95. } else {
  96. temArr.push(a);
  97. }
  98. }
  99. }
  100. if (keyword.length === 1 && temArr.length === 0) temArr.push(keyword);
  101. return temArr;
  102. }
  103. const getWordArray = (keyword) => {
  104. let wordArray = [];
  105. if (keyword.length < 3) {
  106. // 小于3个字的直接按一个词处理
  107. wordArray.push(keyword);
  108. } else {
  109. wordArray = scCut(keyword);
  110. }
  111. // 自定义分词特殊处理
  112. wordArray = cusSegment(wordArray, keyword);
  113. // console.log(`分词结果:${wordArray}`);
  114. return wordArray;
  115. }
// Public API of this segmentation helper module.
module.exports = {
  scCut,
  getCutWords,
  getMatchCount,
  alias,
  getWordArray,
};