123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139 |
- const { Segment, Tokenizer, dicts, synonyms, stopwords, modules } = require('segmentit');
/**
 * Project-specific vocabulary that CusTokenizer extracts verbatim before the
 * stock segmentit modules see the text.
 *
 * Order matters: entries are tried from first to last and every hit is removed
 * from the text, so an entry must come before any shorter entry it contains
 * ('mm2' before 'mm'); otherwise the shorter one would claim the text first.
 *
 * The list is hard-coded. An earlier, disabled variant read it from
 * ../dict/scWords.utf8 (one word per line).
 *
 * Frozen because it is shared, read-only module state.
 */
const scWords = Object.freeze(['硅酸盐水泥', '综合工', '铜芯', '螺纹钢', 'mm2', 'mm']);
/**
 * Tokenizer that carves the project-specific words listed in `scWords` out of
 * the incoming text.
 *
 * According to the original author's notes this tokenizer is registered first
 * (highest priority), and `words` normally holds a single element.
 *
 * No `segment;` class field is declared here on purpose. A bare field in a
 * derived class is initialised to `undefined` right after `super()` returns,
 * which would wipe whatever the base constructor stored on `this.segment`
 * (presumably the owning Segment, which exposes lookups such as
 * `getDict('TABLE')` — confirm against segmentit).
 */
class CusTokenizer extends Tokenizer {
  /**
   * Splits every incoming word into the special words it contains plus the
   * text left over around them.
   *
   * For each input word the output lists the matched special words first, in
   * `scWords` order and once each however often they occur, followed by the
   * remaining text fragments in their original left-to-right order. Text on
   * either side of a removed word stays in separate fragments, so unrelated
   * pieces are never glued into a new string ("3mm2铜芯5" leaves "3" and "5",
   * not "35").
   *
   * @param {Array<{w: string}>} words - Words to split; only `w` is read.
   * @returns {Array<{w: string, p?: number}>} Special words carry `p: 1000`;
   *   leftover fragments carry only `w`.
   */
  split(words) {
    const ret = [];
    for (const word of words) {
      // Text of this word that no special word has claimed yet.
      let fragments = [word.w];
      for (const scWord of scWords) {
        if (!fragments.some((fragment) => fragment.includes(scWord))) continue;
        // `p` is the part-of-speech tag. Per the original author's note, once
        // it is set the following tokenizers leave the word alone; the actual
        // value is irrelevant for this project, so a fixed default is used.
        ret.push({ w: scWord, p: 1000 });
        // Literal split (no RegExp): removes every occurrence and keeps the
        // surrounding text apart, with no need to escape the word.
        fragments = fragments.flatMap((fragment) => fragment.split(scWord));
      }
      for (const fragment of fragments) {
        if (fragment) ret.push({ w: fragment });
      }
    }
    return ret;
  }
}
/**
 * Configures a segmenter with the custom tokenizer and the resources bundled
 * with segmentit, then hands the same instance back.
 *
 * CusTokenizer is listed ahead of the bundled modules; the original author
 * notes that being first gives it the highest priority.
 *
 * @param {object} segmentInstance - Object exposing `use`, `loadDict`,
 *   `loadSynonymDict` and `loadStopwordDict`.
 * @returns {object} The instance that was passed in.
 */
const useDefault = (segmentInstance) => {
  segmentInstance.use([CusTokenizer, ...modules]);

  // Resources are loaded in a fixed order: dictionary, synonyms, stop words.
  const resourceLoaders = [
    ['loadDict', dicts],
    ['loadSynonymDict', synonyms],
    ['loadStopwordDict', stopwords],
  ];
  for (const [loaderName, resource] of resourceLoaders) {
    segmentInstance[loaderName](resource);
  }

  return segmentInstance;
};
// Single shared segmenter, configured once when this module is loaded.
const segmentit = useDefault(new Segment());

/**
 * Segments `text` with the shared segmenter.
 *
 * `doSegment` is called with `stripPunctuation: true` and `simple: true`; the
 * rest of this file treats the result as an array of word strings.
 *
 * @param {string} text - Text to segment.
 * @returns {Array} Result of `doSegment`, or an empty array when `text` is
 *   `null`/`undefined` (there is nothing to segment, and the segmenter is not
 *   handed a non-string it may not accept).
 */
const scCut = (text) => {
  if (text == null) return [];
  // Fresh options object per call so the segmenter never sees shared state.
  const options = { stripPunctuation: true, simple: true };
  return segmentit.doSegment(text, options);
};
/**
 * Builds the combined word list for an item: the words cut from its name
 * followed by the words cut from its specification.
 *
 * Original author's note: specification strings mix many kinds of characters,
 * so they segment very poorly; cutHMM() gives slightly better results.
 *
 * @param {string} aName - Item name.
 * @param {string} aSpecs - Item specification.
 * @returns {Array} Name words first, then specification words.
 */
const getCutWords = (aName, aSpecs) => [...scCut(aName), ...scCut(aSpecs)];
/**
 * Counts how many entries of `aUserWords` occur in `aMatchStr`.
 *
 * Every entry is tested on its own, so a repeated word is counted each time
 * it appears in `aUserWords`.
 *
 * @param {Iterable<string>} aUserWords - Words to look for.
 * @param {string} aMatchStr - Value searched with `indexOf`.
 * @returns {number} Number of entries found.
 */
const getMatchCount = (aUserWords, aMatchStr) =>
  [...aUserWords].reduce(
    (hits, word) => (aMatchStr.indexOf(word) > -1 ? hits + 1 : hits),
    0,
  );
/**
 * Segmentation helper: rewrites known aliases to their standard spelling and
 * returns the resulting string.
 *
 * The user-defined dictionary is entered using the standard (right-hand)
 * forms, so text is normalised to them.
 *
 * @param {string} aStr - Text to normalise.
 * @returns {string} Text with every alias replaced. A falsy input (`''`,
 *   `null`, `undefined`) is returned unchanged.
 */
const alias = (aStr) => {
  if (!aStr) {
    return aStr;
  }
  // [alias, standard form] pairs, applied in order.
  const replacements = [
    ['φ', 'Ф'],
    ['混凝土', '砼'],
  ];
  let result = aStr;
  for (const [from, to] of replacements) {
    // Literal replace-all; no RegExp is built from the alias text.
    result = result.split(from).join(to);
  }
  return result;
};
/**
 * Tries to reduce a three-character word to the two-character words inside it.
 *
 * The two overlapping pairs (characters 1-2 and characters 2-3) are each run
 * through `scCut`; a pair counts as a word when the segmenter returns exactly
 * one element for it.
 *
 * @param {string} word - Word to examine; indexes 0, 1 and 2 are read.
 * @returns {string[]} The pairs recognised as words, first pair first, or
 *   `[word]` when neither pair qualifies.
 */
const handelThreeWord = (word) => {
  const candidatePairs = [word[0] + word[1], word[1] + word[2]];
  const recognised = candidatePairs.filter((pair) => scCut(pair).length === 1);
  return recognised.length > 0 ? recognised : [word];
};
/**
 * Project-specific clean-up of a segmentation result.
 *
 * Rules, applied to each element in order:
 *  - '混凝土' is treated as '砼';
 *  - '砼' and '砖' are always kept;
 *  - any other element is kept only if it is longer than one character;
 *  - a three-character element that is not in `scWords` is handed to
 *    `handelThreeWord`, which prefers two-character words.
 * If nothing survives and `keyword` is a single character, `keyword` itself
 * becomes the only result.
 *
 * @param {Iterable<string>} nameArray - Segmentation result to post-process.
 * @param {string} keyword - The text that was segmented.
 * @returns {string[]} The cleaned-up word list.
 */
const cusSegment = (nameArray, keyword) => {
  const kept = [];

  for (const raw of nameArray) {
    // Unify the two spellings of "concrete".
    const token = raw === '混凝土' ? '砼' : raw;

    if (token === '砼' || token === '砖') {
      kept.push(token);
      continue;
    }
    // Remaining one-character elements are dropped.
    if (!(token.length > 1)) continue;

    const shouldSplit = token.length === 3 && !scWords.includes(token);
    if (shouldSplit) {
      kept.push(...handelThreeWord(token));
    } else {
      kept.push(token);
    }
  }

  if (kept.length === 0 && keyword.length === 1) kept.push(keyword);
  return kept;
};
/**
 * Turns a search keyword into the list of words used for matching.
 *
 * Keywords shorter than three characters are taken as a single word; longer
 * ones are segmented with `scCut`. Either way the result then goes through
 * the project-specific clean-up in `cusSegment`.
 *
 * @param {string} keyword - Keyword to split.
 * @returns {string[]} Words produced by `cusSegment`.
 */
const getWordArray = (keyword) => {
  const roughWords = keyword.length < 3 ? [keyword] : scCut(keyword);
  return cusSegment(roughWords, keyword);
};
- module.exports = {
- scCut,
- getCutWords,
- getMatchCount,
- alias,
- getWordArray,
- };
|