feat: 支持清洗表格标签 (#254)

hhhhsc701 · web-flow · commit 09a8f7564e90 · 2026-01-16T14:45:24.000+08:00
* feat: 支持清洗表格标签

* feat: 支持清洗表格标签
diff --git a/backend/shared/domain-common/src/main/java/com/datamate/common/setting/application/SysParamApplicationService.java b/backend/shared/domain-common/src/main/java/com/datamate/common/setting/application/SysParamApplicationService.java
@@ -12,6 +12,7 @@
 
 import java.util.Comparator;
 import java.util.List;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 /**
  * 系统参数应用服务
@@ -25,6 +26,7 @@
 public class SysParamApplicationService {
     private final SysParamRepository sysParamRepository;
     private final RedisClient redisClient;
+    private final AtomicBoolean redisEnable = new AtomicBoolean(true);
 
     /**
      * 列表查询系统参数
@@ -59,17 +61,18 @@ public void deleteParamById(String paramKey) {
     }
 
     public String getParamByKey(String paramId) {
-        boolean redisEnable = false;
         String value = null;
-        try {
-            value = redisClient.getParamWithThrow(paramId);
-            redisEnable = true;
-        } catch (Exception e) {
-            log.warn(e.getMessage());
+        if (redisEnable.get()) { // skip Redis entirely once a previous read has failed
+            try {
+                value = redisClient.getParamWithThrow(paramId);
+            } catch (Exception e) {
+                redisEnable.set(false); // NOTE(review): nothing visible here turns this back on after a transient failure - confirm intended
+                log.warn("Failed to read param {} from Redis, falling back to the repository", paramId, e);
+            }
         }
         if (value == null) {
             SysParam sysParam = sysParamRepository.getById(paramId);
-            if (sysParam != null && redisEnable) {
+            if (sysParam != null) {
                 value = sysParam.getParamValue();
             }
         }
diff --git a/frontend/src/pages/DataCleansing/Create/components/ParamConfig.tsx b/frontend/src/pages/DataCleansing/Create/components/ParamConfig.tsx
@@ -8,6 +8,7 @@ import {
   InputNumber,
   Slider,
   Space,
+  Switch,
 } from "antd";
 import { ConfigI, OperatorI } from "@/pages/OperatorMarket/operator.model";
 
@@ -215,12 +216,12 @@ const ParamConfig: React.FC<ParamConfigProps> = ({
           tooltip={param.description}
           key={paramKey}
         >
-          <Checkbox
-            checked={value as boolean}
-            onChange={(e) => updateValue(e.target.checked)}
-          >
-            {param.name}
-          </Checkbox>
+          <Switch
+            checkedChildren={param.checkedLabel}
+            unCheckedChildren={param.unCheckedLabel}
+            defaultChecked={String(param.defaultVal) === 'true'}
+            onChange={(checked) => updateValue(checked)}
+          />
         </Form.Item>
       );
     case "multiple":
diff --git a/runtime/ops/filter/img_similar_images_cleaner/process.py b/runtime/ops/filter/img_similar_images_cleaner/process.py
@@ -139,8 +139,8 @@ def get_orb_similarity(self, des_matrix: np.ndarray, des_matrix_history: np.ndar
             orb_similarity = count / len(matches)
             return orb_similarity
         except Exception as e:
-            logger.exception(f"taskId: ｛self.task_uuid｝, failed to compare the similarity between "
-                             f"｛file_name｝ and ｛file_name_history｝: {e}")
+            logger.exception(f"taskId: {self.task_uuid}, failed to compare the similarity between "
+                             f"{file_name} and {file_name_history}: {e}")
             return 0.0
 
     def execute_sql(self, p_hash: str, des_matrix: np.ndarray, file_name: str,
diff --git a/runtime/ops/mapper/html_tag_cleaner/metadata.yml b/runtime/ops/mapper/html_tag_cleaner/metadata.yml
@@ -14,3 +14,12 @@ effect:
   after: '机器学习是人工智能的一个分支。'
 inputs: 'text'
 outputs: 'text'
+settings:
+  removeTableTags:
+    name: '是否去除表格标签'
+    description: '若为是，则会去除表格标签<tr><td>等。'
+    type: 'switch'
+    defaultVal: 'false'
+    required: false
+    checkedLabel: '是'
+    unCheckedLabel: '否'
diff --git a/runtime/ops/mapper/html_tag_cleaner/process.py b/runtime/ops/mapper/html_tag_cleaner/process.py
@@ -34,8 +34,14 @@ class HtmlTagCleaner(Mapper):
         '<sup>', '<template>', '<textarea>', '<tfoot>', '<thead>', '<time>', '<title>', '<track>', '<tt>', '<u>',
         '<ul>', '<var>', '<video>', '<wbr>', '<xmp>'
     ]
+    # 需要添加的表格标签
+    table_tags = ['<table>', '<tbody>', '<td>', '<th>', '<tr>']
     preserved_attr_list = ['colspan', 'rowspan']  # 需要保留的标签属性列表
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.remove_table_tags = str(kwargs.get('removeTableTags', False)).strip().lower() in ('true', '1')  # bool('false') is True
+
     @staticmethod
     def _remove_specified_tags(input_data: str, specified_tags: List):
         """移除指定html标签及其属性值"""
@@ -68,13 +74,15 @@ def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         if sample[self.filetype_key] != "xml":
             sample[self.text_key] = self._remove_html_tags(sample[self.text_key])
             logger.info(
-                f"fileName: ｛sample[self.filename_key]｝, method: HtmlTagCleaner costs {time.time() - start:6f} s")
+                f"fileName: {sample[self.filename_key]}, method: HtmlTagCleaner costs {time.time() - start:6f} s")
         else:
             logger.info(f"fileName: {sample[self.filename_key]}, method: HtmlTagCleaner, The file is xml!")
         return sample
 
     def _remove_html_tags(self, input_data: str):
-        # 去除常见的html标签及其属性值（不包括<table>、<tbody>、<tr>、<td>、<th>）
+        # Strip common HTML tags and their attribute values; table tags are added only when remove_table_tags is set.
+        if self.remove_table_tags and not set(self.table_tags).issubset(self.tag_list):
+            self.tag_list = self.tag_list + self.table_tags  # one-time per-instance copy; the class-level list is never mutated
         cleaned_text = self._remove_specified_tags(input_data, self.tag_list)
         # 去除表格标签内的属性值（不包括colspan、rowspan属性），eg:<td class="td8" rowspan="3"> —> <td rowspan="3">
         cleaned_text = self._remove_tag_attributes(cleaned_text, self.preserved_attr_list)
diff --git a/runtime/ops/mapper/img_direction_correct/process.py b/runtime/ops/mapper/img_direction_correct/process.py
@@ -64,7 +64,7 @@ def _detect_direction(image, file_name, model):
         rotate_angle = int(cls_res.get("class_ids", np.array([0], dtype='int32')).item())
         pro = float(cls_res.get("scores", np.array([0], dtype='int32')).item())
         logger.info(
-            f"fileName: ｛file_name｝, model ｛model.model_name｝ detect result is {rotate_angle} with confidence ｛pro｝")
+            f"fileName: {file_name}, model {model.model_name} detect result is {rotate_angle} with confidence {pro}")
         if rotate_angle == 90 and pro > 0.89:
             return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
         if rotate_angle == 180 and pro > 0.89:
@@ -107,7 +107,7 @@ def execute(self, sample: Dict[str, Any]):
             data = bytes_transform.bytes_to_numpy(img_bytes)
             correct_data = self._img_direction_correct(data, file_name, self.model)
             sample[self.data_key] = bytes_transform.numpy_to_bytes(correct_data, file_type)
-            logger.info(f"fileName: ｛file_name｝, method: ImgDirectionCorrect costs {time.time() - start:6f} s")
+            logger.info(f"fileName: {file_name}, method: ImgDirectionCorrect costs {time.time() - start:6f} s")
         return sample
 
     def _img_direction_correct(self, img, file_name, standard_model):
diff --git a/runtime/ops/mapper/img_enhanced_saturation/process.py b/runtime/ops/mapper/img_enhanced_saturation/process.py
@@ -52,7 +52,7 @@ def enhance_saturation(self, image_data: np.ndarray, file_name):
 
         # 图片饱和度较高，不需要增强饱和度
         if saturation_factor <= 1:
-            logger.info(f"fileName: ｛file_name｝, method: ImgSaturation not need enhancement")
+            logger.info(f"fileName: {file_name}, method: ImgSaturation not need enhancement")
             return image_data
 
         # 计算图片红色通道均值， 如果过大，需要限制saturation factor大小，否则图片会泛红, 产生色彩畸变。
@@ -78,5 +78,5 @@ def execute(self, sample: Dict[str, Any]):
             img_data = bytes_transform.bytes_to_numpy(img_bytes)
             img_data = self.enhance_saturation(img_data, file_name)
             sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
-        logger.info(f"fileName: ｛file_name｝, method: ImgSaturation costs {time.time() - start:6f} s")
+        logger.info(f"fileName: {file_name}, method: ImgSaturation costs {time.time() - start:6f} s")
         return sample
diff --git a/runtime/ops/mapper/xml_tag_cleaner/process.py b/runtime/ops/mapper/xml_tag_cleaner/process.py
@@ -57,9 +57,9 @@ def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         if sample[self.filetype_key] == "xml":
             try:
                 sample[self.text_key] = self._tag_clean_xml(sample[self.text_key])
-                logger.info(f"fileName: ｛file_name｝, method: XMLTagCleaner costs {time.time() - start:6f} s")
+                logger.info(f"fileName: {file_name}, method: XMLTagCleaner costs {time.time() - start:6f} s")
             except ExpatError as err:
-                logger.error(f"fileName: {file_name} is abnormal xml form: ｛err｝")
+                logger.error(f"fileName: {file_name} is abnormal xml form: {err}")
                 raise RuntimeError(81001, str(err)) from None
             except Exception as err:
                 logger.error(f"fileName {file_name}, method: XMLTagCleaner causes other error: {err}")
diff --git a/scripts/db/data-operator-init.sql b/scripts/db/data-operator-init.sql
@@ -134,7 +134,7 @@ ON CONFLICT DO NOTHING;
 
 INSERT INTO t_operator
 (id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
-VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API，抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
+VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API，抽取PDF中的文本。', '1.0.0', 'text', 'text', null, '{"exportType":{"name":"导出类型","description":"指定清洗结果文件类型。若指定为md且后续存在其他清洗算子，可能导致文件格式错乱。","type":"select","defaultVal":"md","required":false,"options":[{"label":"markdown","value":"md"},{"label":"txt","value":"txt"}]}}', '', false),
        ('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值，该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时，选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
        ('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值，该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
        ('FileWithHighSpecialCharRateFilter', '文档特殊字符率检查', '去除特殊字符过多的文档。', '1.0.0', 'text', 'text', null, '{"specialCharRatio": {"name": "文档特殊字符率", "description": "特殊字符的统计数/文档总字数 > 设定值，该文档被去除。", "type": "slider", "defaultVal": 0.3, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
@@ -148,7 +148,7 @@ VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API，抽取P
        ('ExtraSpaceCleaner', '多余空格去除', '移除文档首尾、句中或标点符号附近多余空格和 tab 等。', '1.0.0', 'text', 'text', null, null, '', 'false'),
        ('FullWidthCharacterCleaner', '全角转半角', '将文档中的所有全角字符转换成半角字符。', '1.0.0', 'text', 'text', null, null, '', 'false'),
        ('GrableCharactersCleaner', '文档乱码去除', '去除文档中的乱码和无意义的unicode。', '1.0.0', 'text', 'text', null, null, '', 'false'),
-       ('HtmlTagCleaner', 'HTML标签去除', '移除文档中HTML标签，如 <html>、<dev>、<p> 等。', '1.0.0', 'text', 'text', null, null, '', 'false'),
+       ('HtmlTagCleaner', 'HTML标签去除', '移除文档中HTML标签，如 <html>、<dev>、<p> 等。', '1.0.0', 'text', 'text', null, '{"removeTableTags":{"name":"是否去除表格标签","description":"若为是，则会去除表格标签<tr><td>等。","type":"switch","defaultVal":"false","required":false,"checkedLabel":"是","unCheckedLabel":"否"}}', '', 'false'),
        ('AnonymizedIdNumber', '身份证号匿名化', '身份证号匿名化。', '1.0.0', 'text', 'text', null, null, '', 'false'),
        ('InvisibleCharactersCleaner', '不可见字符去除', '去除文档中的不可见字符，例如 0-31 号字符中的部分字符。', '1.0.0', 'text', 'text', null, null, '', 'false'),
        ('AnonymizedIpAddress', 'IP地址匿名化', 'IP地址匿名化', '1.0.0', 'text', 'text', null, null, '', 'false'),