Indexer.php 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. #!/usr/bin/env php
  2. <?php
  3. /**
  4. * Xunsearch PHP-SDK 索引管理工具
  5. *
  6. * @author hightman
  7. * @link http://www.xunsearch.com/
  8. * @copyright Copyright &copy; 2011 HangZhou YunSheng Network Technology Co., Ltd.
  9. * @license http://www.xunsearch.com/license/
  10. * @version $Id$
  11. */
  12. require_once dirname(__FILE__) . '/../lib/XS.php';
  13. require_once dirname(__FILE__) . '/XSUtil.class.php';
  14. require_once dirname(__FILE__) . '/XSDataSource.class.php';
  // Parse the command line.
  // NOTE(review): the list handed to parseOpt() appears to name the options that
  // carry a value (every entry is one, and none of the pure flags such as
  // rebuild/clean/flush/info is listed) - confirm against XSUtil::parseOpt().
  // 'filter' is included on that basis, so that the documented
  // `--filter <name|path>` form is read as "option + value".
  //ini_set('memory_limit', '1024M');
  XSUtil::parseOpt(array(
      'p', 'c', 'd', 'project', 'charset', 'db', 'source', 'file', 'sql',
      'csv-delimiter', 'filter', 'add-synonym', 'del-synonym',
  ));
  // NOTE(review): the third getOpt() argument is passed as true only for
  // 'project' and 'file'; its exact meaning lives in XSUtil - confirm there.
  $project = XSUtil::getOpt('p', 'project', true);

  // charset used for console output and as the fallback data source charset
  $charset = XSUtil::getOpt('c', 'charset');
  XSUtil::setCharset($charset);

  // Long options: each one becomes a global variable of the same name with
  // '-' replaced by '_' (e.g. --flush-log => $flush_log).
  $params = array(
      'source', 'file', 'sql', 'rebuild', 'clean', 'flush', 'flush-log', 'info',
      'csv-delimiter', 'filter', 'add-synonym', 'del-synonym',
  );
  foreach ($params as $_)
  {
      $k = strtr($_, '-', '_');
      $$k = XSUtil::getOpt(null, $_);
  }

  // file & database ($file is fetched again here, this time with the third argument set)
  $file = XSUtil::getOpt(null, 'file', true);
  $db = XSUtil::getOpt('d', 'db');
  // Help screen: printed when -h/--help is given, when no project name was
  // supplied, or when none of the action options was requested.
  $help_requested = XSUtil::getOpt('h', 'help') !== null;
  $has_action = $flush || $flush_log || $info || $clean || $source || $add_synonym || $del_synonym;
  if ($help_requested || !is_string($project) || !$has_action)
  {
      $version = PACKAGE_NAME . '/' . PACKAGE_VERSION;
      // heredoc body and closing marker stay at column 0 (required before PHP 7.3)
      echo <<<EOF
Indexer - 索引批量管理、导入工具 ($version)
用法
{$_SERVER['argv'][0]} [options] [-p|--project] <project> [--file] <file>
选项说明
--project=<name|ini>
-p <project> 用于指定要搜索的项目名称或项目配置文件的路径,
如果指定的是名称,则使用 ../app/<name>.ini 作为配置文件
--charset=<gbk|utf-8>
-c <charset> 指定您当前在用以及数据源的字符集,以便系统进行智能转换(默认:UTF-8)
--db=<name>
-d <db> 指定项目中的数据库名称,默认是名为 db 的库
--source=mysql://[user[:passwd]@]host/dbname[/table]
指定数据源为 mysql
--source=sqlite:[//]<dbpath>|sqlite3:[//]<dbpath>
指定数据源为 sqlite 或 sqlite3
--source=json指定数据源为 json 格式,每行一条记录
--source=csv 指定数据源为 csv 格式,逗号分隔字段,每行一条记录,可在首行指定字段名
--csv-delimiter[=,] 指定 csv 数据源的字段分割符,默认为逗号,支持 \\t\\r\\n..\\xNN
使用 \\ 开头及其它与 shell 有岐议的分割符时请使用引号包围。
--file=<file>当数据源为 json 或 csv 格式时指定数据源文件,默认读取标准输入
--sql=<sql> 当数据源为 sql 类型时指定 sql 搜索语句,默认情况下,
如果在 --source 包含 table 则载入该表数据。
警告:请勿在 SQL 语句中包含 `` 反引号,这在 SHELL 中有特殊函义可能会出错
--filter <name|path>
指定数据过滤器,可为内置的 debug 或自定义的过滤器文件路径(不包含 .php)
过滤器必须实现接口 XSDataFilter
--add-synonym=<raw1:synonym1[,raw2:synonym2]...>
添加一个或多个同义词, 多个之间用半角逗号分隔, 原词和同义词之间用冒号分隔
--del-synonym=<raw1[:synonym1[,raw2[:synonym2]]]...>
删除一个或多个同义词, 多个之间用半角逗号分隔, 原词和同义词之间用冒号分隔
省略同义词则表示删除该原词的所有同义词
--rebuild 使用平滑重建方式导入数据,必须与 --source 配合使用
--clean 清空库内当前的索引数据
--flush 强制提交刷新索引服务端的缓冲索引,与 --source 分开用
--flush-log 强制提交刷新搜索日志,与 --source 分开用
--info 查看当前索引库在服务端的信息(含数据缓冲、运行进程等)
-h|--help 显示帮助信息
EOF;
      exit(0);
  }
  // Locate the project configuration: $project is either a path to an existing
  // ini file, or a bare name resolved to ../app/<name>.ini next to this script.
  if (file_exists($project))
      $ini = $project;
  else
      $ini = dirname(__FILE__) . '/../app/' . $project . '.ini';

  // Abort with a non-zero status when no configuration file can be found.
  if (!file_exists($ini))
  {
      echo "错误:无效的项目名称 ($project),不存在相应的配置文件。\n";
      exit(-1);
  }
  // csv delimiter: decoded here and published through the super global
  // $_SERVER['XS_CSV_DELIMITER'] (only when the data source is csv)
  if (is_string($csv_delimiter) && $source == 'csv')
  {
      $csv_delimiter = csv_parse_delimiter($csv_delimiter);
      $_SERVER['XS_CSV_DELIMITER'] = $csv_delimiter;
      printf("注意:CSV 字段分割符被修改为 `%c` (ASCII: 0x%02x)\n", ord($csv_delimiter), ord($csv_delimiter));
  }

  /**
   * Decode the value of --csv-delimiter into a single delimiter character.
   *
   * A value that does not start with a backslash contributes its first
   * character. Backslash forms understood: \\ (backslash), \t, \r, \n and
   * \xNN (hex code). The help text of this tool advertises \r and \n, which
   * were previously mapped to a comma. An empty value, an unknown escape, or
   * a \x without valid hex digits falls back to the default comma instead of
   * producing an empty/NUL delimiter.
   *
   * @param string $spec raw option value
   * @return string exactly one character
   */
  function csv_parse_delimiter($spec)
  {
      if ($spec === '')
          return ',';
      if ($spec[0] !== '\\')
          return $spec[0];

      switch (substr($spec, 1, 1))
      {
          case '\\':
              return '\\';
          case 't':
              return "\t";
          case 'r':
              return "\r";
          case 'n':
              return "\n";
          case 'x':
              $hex = substr($spec, 2);
              // substr() may return false on old PHP when nothing follows "\x"
              if (is_string($hex) && preg_match('/^[0-9a-fA-F]+$/', $hex))
                  return chr(hexdec($hex));
              return ',';
          default:
              return ',';
      }
  }
  // filter: turn the --filter option into an XSDataFilter instance or null.
  // Every non-null value is validated here, because the import loop later
  // calls $filter->process() on anything that is not null; a non-string value
  // (previously left untouched) would cause a fatal error there.
  if ($filter !== null)
  {
      $original = is_string($filter) ? $filter : '';
      $instance = null;
      if (is_string($filter))
      {
          // 1) built-in filter: "debug" => class XSDebugFilter
          $class = 'XS' . ucfirst(strtolower($filter)) . 'Filter';
          if (class_exists($class))
              $instance = new $class;
          else if (file_exists($filter . '.php'))
          {
              // 2) custom filter: <path>.php must define a class named after the file
              $class = basename($filter);
              require_once $filter . '.php';
              if (class_exists($class))
                  $instance = new $class;
          }
      }
      // anything that is not an XSDataFilter is dropped with a notice
      if (!($instance instanceof XSDataFilter))
      {
          $instance = null;
          echo "注意:自动忽略无效的过滤器 [" . $original . "]\n";
      }
      $filter = $instance;
  }
  // execute the indexer
  try
  {
      // create xs object and select the target database
      $xs = new XS($ini);
      $index = $xs->index;
      if ($db !== null)
          $index->setDb($db);

      // one-shot actions: --info, --flush-log and --flush are handled alone,
      // in that order of precedence
      if ($info !== null)
      {
          $res = $index->execCommand(CMD_INDEX_GET_DB);
          $res = json_decode($res->buf);
          echo "数据库名:" . $res->name . "\n";
          echo "队列数据:" . $res->count . "条\n";
          echo "导入进程:" . ($res->pid > 0 ? '#' . $res->pid : '无') . "\n";
      }
      else if ($flush_log !== null)
      {
          echo "刷新搜索日志 ... \n";
          if (($res = $index->flushLogging()) === false)
              echo "失败\n";
          else
              echo "成功,注意:后台更新需要一些时间,并不是真正立即完成。\n";
      }
      else if ($flush !== null)
      {
          echo "刷新索引缓冲 ... \n";
          if (($res = $index->flushIndex()) === false)
              echo "失败\n";
          else
              echo "成功,注意:后台更新需要一些时间,并不是真正立即完成。\n";
      }
      else
      {
          // clean
          if ($clean !== null)
          {
              echo "清空现有索引数据 ...\n";
              $index->clean();
          }

          // begin rebuild
          if ($rebuild !== null)
          {
              echo "开始重建索引 ...\n";
              $index->beginRebuild();
          }

          // import data from source
          if (!empty($source))
          {
              echo "初始化数据源 ... $source \n";
              $fid = $xs->getFieldId();
              $total = $total_ok = $total_failed = 0;
              // a source containing ':' (mysql://, sqlite:) takes the SQL statement,
              // any other source (json, csv) takes the input file
              $src = XSDataSource::instance($source, strpos($source, ':') ? $sql : $file);
              $dcs = $src->getCharset();
              if ($dcs === false)
                  $dcs = $charset === null ? 'UTF-8' : $charset;
              $doc = new XSDocument($dcs);

              echo "开始批量导入数据 (" . (empty($file) ? "请直接输入数据" : $file) . ") ...\n";
              XSUtil::flush();
              $index->setTimeout(0);
              $index->openBuffer();
              while ($data = $src->getData())
              {
                  if ($source == 'csv')
                  {
                      // csv rows are positional; null means the row was the header line
                      $data = csv_transform($data);
                      if (is_null($data))
                          continue;
                  }

                  if ($filter !== null)
                  {
                      // remember the primary key for the warning below; it is read
                      // only when a filter is active and tolerates a missing key
                      $pk = isset($data[$fid->name]) ? $data[$fid->name] : '';
                      $data = $filter->process($data, $dcs);
                      if ($data === false)
                      {
                          $total++;
                          echo "警告:过滤器忽略了第 $total 条数据, 主键为:" . $pk . "\n";
                          continue;
                      }
                  }

                  $doc->setFields($data);
                  try
                  {
                      $total++;
                      $index->update($doc);
                      $total_ok++;
                  }
                  catch (XSException $e)
                  {
                      echo "警告:添加第 $total 条数据失败 - " . $e->getMessage() . "\n";
                      // terminate the trace with a newline so the next message starts on its own line
                      echo $e->getTraceAsString() . "\n";
                      $total_failed++;
                  }
                  // reset the shared document before the next row
                  $doc->setFields(null);
                  if (($total % 10000) == 0)
                      echo "报告:累计已处理数据 $total 条 ...\n";
              }
              $index->closeBuffer();
              echo "完成索引导入:成功 $total_ok 条,失败 $total_failed 条\n";
          }

          // add synonyms (entries without a synonym are ignored)
          if (is_string($add_synonym))
          {
              $rec = parse_synonym_list($add_synonym, true);
              echo "报告:开始添加同义词记录 " . count($rec) . " 条...\n";
              // the buffer is only opened when there is more than one record
              if (count($rec) > 1)
                  $index->openBuffer();
              foreach ($rec as $tmp)
                  $index->addSynonym($tmp[0], $tmp[1]);
              if (count($rec) > 1)
                  $index->closeBuffer();
          }

          // del synonyms (an empty synonym is passed through as '')
          if (is_string($del_synonym))
          {
              $rec = parse_synonym_list($del_synonym, false);
              echo "报告:开始删除同义词记录 " . count($rec) . " 条...\n";
              if (count($rec) > 1)
                  $index->openBuffer();
              foreach ($rec as $tmp)
                  $index->delSynonym($tmp[0], $tmp[1]);
              if (count($rec) > 1)
                  $index->closeBuffer();
          }

          // end rebuild, or flush the index when not rebuilding
          if ($rebuild !== null)
          {
              echo "完成重建索引 ...\n";
              $index->endRebuild();
          }
          else
          {
              echo "刷新索引提交 ...\n";
              $index->flushIndex();
          }
      }
  }
  catch (XSException $e)
  {
      // Print the exception followed by its trace, with this script's directory
      // stripped and the SDK root replaced by the value of XSException::getRelPath().
      $start = dirname(dirname(__FILE__));
      $relative = XSException::getRelPath($start);
      $traceString = $e->getTraceAsString();
      $traceString = str_replace(dirname(__FILE__) . '/', '', $traceString);
      $traceString = str_replace($start . ($relative === '' ? '/' : ''), $relative, $traceString);
      echo $e . "\n" . $traceString . "\n";
  }

  /**
   * Split a "raw1:syn1,raw2:syn2" option value into array(raw, synonym) pairs.
   *
   * Items are separated by commas, raw word and synonym by the first colon;
   * both parts are trimmed. Items with an empty raw word are always skipped.
   *
   * @param string $spec option value of --add-synonym / --del-synonym
   * @param bool $require_synonym true: skip items without a (non-empty) synonym;
   *                              false: keep them with '' as the synonym
   * @return array list of array(raw, synonym)
   */
  function parse_synonym_list($spec, $require_synonym)
  {
      $pairs = array();
      foreach (explode(',', $spec) as $item)
      {
          $parts = explode(':', $item, 2);
          $raw = trim($parts[0]);
          $syn = isset($parts[1]) ? trim($parts[1]) : '';
          if ($raw === '' || ($require_synonym && $syn === ''))
              continue;
          $pairs[] = array($raw, $syn);
      }
      return $pairs;
  }
  /**
   * Translate one positional csv row into a field-name => value array.
   *
   * The column order is decided on the first call: it defaults to the order
   * of the project's scheme fields (read from the global $xs), but when every
   * cell of the first row is itself a field name, that row is taken as the
   * header, announced on stdout, and null is returned for it.
   *
   * Columns are matched strictly by position, so a header that repeats a
   * field name no longer shifts the following columns (the last occurrence
   * wins for the repeated name). Mapping stops at the first column that is
   * missing from the row.
   *
   * @param array $data list of cell values of one csv row
   * @return array|null mapped row, or null when the row was the header line
   */
  function csv_transform($data)
  {
      static $fields = null;
      global $xs; /* @var $xs XS */

      // init field set (first call only)
      if (is_null($fields))
      {
          // default column order; names are normalised to strings so the strict
          // header check below also works for numeric field names
          $fields = array_map('strval', array_keys($xs->getScheme()->getAllFields()));

          // the row is a header only if it is non-empty and consists solely of field names
          $is_header = count($data) > 0;
          foreach ($data as $tmp)
          {
              if (!in_array((string) $tmp, $fields, true))
              {
                  $is_header = false;
                  break;
              }
          }
          if ($is_header)
          {
              $fields = array_values($data);
              echo "注意:CSV 数据字段被指定为:" . implode(',', $data) . "\n";
              return null;
          }
      }

      // transform: the n-th field name receives the n-th cell
      $ret = array();
      foreach ($fields as $index => $key)
      {
          if (!isset($data[$index]))
              break;
          $ret[$key] = $data[$index];
      }
      return $ret;
  }
  346. }