| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139 | const { Segment, Tokenizer, dicts, synonyms, stopwords, modules } = require('segmentit');const scWords = ['硅酸盐水泥', '综合工', '铜芯', '螺纹钢', 'mm2', 'mm'];/* const buffer = readFileSync(join(__dirname, '../dict/scWords.utf8'));const dicText = buffer.toString();export const scWords = dicText.split(/\r?\n/); */class CusTokenizer extends Tokenizer {  segment;  split(words) {    // 需要的话可以获取到 this.segment 里的各种信息    // const TABLE = this.segment.getDict('TABLE');    const ret = [];    // 这个拦截器放在第一位,优先级最高words 一般只有一个元素    for (const word of words) {      let source = word.w;      for (const scWord of scWords) {        if (source.includes(scWord)) {          // ret.push({ w: scWord, p: TABLE[scWord].p });          ret.push({ w: scWord, p: 1000 }); // 这个p 表示词性,有值时,后一个分词器就不会再对这个词进行处理了, 对于我们使用场景来说没什么用,给个默认值就好          const reg = new RegExp(`${scWord}`, 'g');          source = source.replace(reg, '');        }      }      if (source) ret.push({ w: source });    }    return ret;  }}const useDefault = (segmentInstance) => {  segmentInstance.use([CusTokenizer, ...modules]);  segmentInstance.loadDict(dicts); // dicText,  segmentInstance.loadSynonymDict(synonyms);  segmentInstance.loadStopwordDict(stopwords);  return segmentInstance;};const segmentit = useDefault(new Segment());const scCut = (text) => {  const options = { stripPunctuation: true, simple: true };  return segmentit.doSegment(text, options);};// 获取分词数组。const getCutWords = (aName, aSpecs) => {  const cutNames = scCut(aName);  const cutSpecs = scCut(aSpecs); // 规格字符混杂,分词效果极差,使用 cutHMM()效果略好。  const rst = [...cutNames, ...cutSpecs];  return rst;}// 返回匹配数。const getMatchCount = (aUserWords, aMatchStr) => {  let count = 0;  for (const word of aUserWords) {    if (aMatchStr.indexOf(word) > -1) count += 1;  }  return count;}// 分词算法相关:别名替换。返回替换后的新字符串。const alias = (aStr) => {  if (!aStr) {    return aStr;  }  // 替换前  const a1 = ['φ', '混凝土'];  // 替换后,标准。用户自定义词库以a2中的词为标准录入。  const a2 = ['Ф', '砼'];  for (const key in a1) {    const patt = new RegExp(a1[key], 'g');    aStr = aStr.replace(patt, a2[key]);  }  return aStr;}const handelThreeWord = (word) => {  function getArr(tem, list) {    const nameArray = scCut(tem);    // 说明是一个词    if (nameArray.length === 1) list.push(tem);  }  const arr = [];  getArr(word[0] + word[1], arr);  getArr(word[1] + word[2], arr);  if (arr.length > 0) return arr;  return [word];}// 自定义特殊处理const cusSegment = (nameArray, keyword) => {  const temArr = [];  for (let a of nameArray) {    // 混凝土 和 砼 统一认成砼    if (a === '混凝土') a = '砼';    if (a === '砼' || a === '砖') {      temArr.push(a);    } else if (a.length > 1) {      // 如果是三个字的词,优先识别成两个字      if (a.length === 3 && !scWords.includes(a)) {        const sArr = handelThreeWord(a);        temArr.push(...sArr);      } else {        temArr.push(a);      }    }  }  if (keyword.length === 1 && temArr.length === 0) temArr.push(keyword);  return temArr;}const getWordArray = (keyword) => {  let wordArray = [];  if (keyword.length < 3) {    // 小于3个字的直接按一个词处理    wordArray.push(keyword);  } else {    wordArray = scCut(keyword);  }  // 自定义分词特殊处理  wordArray = cusSegment(wordArray, keyword);  // console.log(`分词结果:${wordArray}`);  return wordArray;}module.exports = {  scCut,  getCutWords,  getMatchCount,  alias,  getWordArray,};
 |