// segmentit.js — Chinese word-segmentation helpers for construction-domain text,
// built on the "segmentit" library with a custom domain-term tokenizer.
const { Segment, Tokenizer, dicts, synonyms, stopwords, modules } = require('segmentit');
// Domain-specific construction terms that must always be kept as whole tokens.
// Order matters: longer variants (e.g. 'mm2') are listed before their prefixes ('mm').
const scWords = ['硅酸盐水泥', '综合工', '铜芯', '螺纹钢', 'mm2', 'mm'];
// Alternative (disabled): load the term list from a dictionary file instead of hard-coding it.
/* const buffer = readFileSync(join(__dirname, '../dict/scWords.utf8'));
const dicText = buffer.toString();
export const scWords = dicText.split(/\r?\n/); */
  6. class CusTokenizer extends Tokenizer {
  7. segment;
  8. split(words) {
  9. // 需要的话可以获取到 this.segment 里的各种信息
  10. // const TABLE = this.segment.getDict('TABLE');
  11. const ret = [];
  12. // 这个拦截器放在第一位,优先级最高words 一般只有一个元素
  13. for (const word of words) {
  14. let source = word.w;
  15. for (const scWord of scWords) {
  16. if (source.includes(scWord)) {
  17. // ret.push({ w: scWord, p: TABLE[scWord].p });
  18. ret.push({ w: scWord, p: 1000 }); // 这个p 表示词性,有值时,后一个分词器就不会再对这个词进行处理了, 对于我们使用场景来说没什么用,给个默认值就好
  19. const reg = new RegExp(`${scWord}`, 'g');
  20. source = source.replace(reg, '');
  21. }
  22. }
  23. if (source) ret.push({ w: source });
  24. }
  25. return ret;
  26. }
  27. }
  28. const useDefault = (segmentInstance) => {
  29. segmentInstance.use([CusTokenizer, ...modules]);
  30. segmentInstance.loadDict(dicts); // dicText,
  31. segmentInstance.loadSynonymDict(synonyms);
  32. segmentInstance.loadStopwordDict(stopwords);
  33. return segmentInstance;
  34. };
  35. const segmentit = useDefault(new Segment());
  36. const scCut = (text) => {
  37. const options = { stripPunctuation: true, simple: true };
  38. return segmentit.doSegment(text, options);
  39. };
  40. // 获取分词数组。
  41. const getCutWords = (aName, aSpecs) => {
  42. const cutNames = scCut(aName);
  43. const cutSpecs = scCut(aSpecs); // 规格字符混杂,分词效果极差,使用 cutHMM()效果略好。
  44. const rst = [...cutNames, ...cutSpecs];
  45. return rst;
  46. }
  47. // 返回匹配数。
  48. const getMatchCount = (aUserWords, aMatchStr) => {
  49. let count = 0;
  50. for (const word of aUserWords) {
  51. if (aMatchStr.indexOf(word) > -1) count += 1;
  52. }
  53. return count;
  54. }
  55. // 分词算法相关:别名替换。返回替换后的新字符串。
  56. const alias = (aStr) => {
  57. if (!aStr) {
  58. return aStr;
  59. }
  60. // 替换前
  61. const a1 = ['φ', '混凝土'];
  62. // 替换后,标准。用户自定义词库以a2中的词为标准录入。
  63. const a2 = ['Ф', '砼'];
  64. for (const key in a1) {
  65. const patt = new RegExp(a1[key], 'g');
  66. aStr = aStr.replace(patt, a2[key]);
  67. }
  68. return aStr;
  69. }
  70. const handelThreeWord = (word) => {
  71. function getArr(tem, list) {
  72. const nameArray = scCut(tem);
  73. // 说明是一个词
  74. if (nameArray.length === 1) list.push(tem);
  75. }
  76. const arr = [];
  77. getArr(word[0] + word[1], arr);
  78. getArr(word[1] + word[2], arr);
  79. if (arr.length > 0) return arr;
  80. return [word];
  81. }
  82. // 自定义特殊处理
  83. const cusSegment = (nameArray, keyword) => {
  84. const temArr = [];
  85. for (let a of nameArray) {
  86. // 混凝土 和 砼 统一认成砼
  87. if (a === '混凝土') a = '砼';
  88. if (a === '砼' || a === '砖') {
  89. temArr.push(a);
  90. } else if (a.length > 1) {
  91. // 如果是三个字的词,优先识别成两个字
  92. if (a.length === 3 && !scWords.includes(a)) {
  93. const sArr = handelThreeWord(a);
  94. temArr.push(...sArr);
  95. } else {
  96. temArr.push(a);
  97. }
  98. }
  99. }
  100. if (keyword.length === 1 && temArr.length === 0) temArr.push(keyword);
  101. return temArr;
  102. }
  103. const getWordArray = (keyword) => {
  104. let wordArray = [];
  105. if (keyword.length < 3) {
  106. // 小于3个字的直接按一个词处理
  107. wordArray.push(keyword);
  108. } else {
  109. wordArray = scCut(keyword);
  110. }
  111. // 自定义分词特殊处理
  112. wordArray = cusSegment(wordArray, keyword);
  113. // console.log(`分词结果:${wordArray}`);
  114. return wordArray;
  115. }
// Public API of this segmentation helper module.
module.exports = {
  scCut,
  getCutWords,
  getMatchCount,
  alias,
  getWordArray,
};