Skip to content

Commit 09a8f75

Browse files
authored
feat: 支持清洗表格标签 (#254)
* feat: 支持清洗表格标签 * feat: 支持清洗表格标签
1 parent a788c7e commit 09a8f75

File tree

9 files changed

+46
-25
lines changed

9 files changed

+46
-25
lines changed

backend/shared/domain-common/src/main/java/com/datamate/common/setting/application/SysParamApplicationService.java

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
import java.util.Comparator;
1414
import java.util.List;
15+
import java.util.concurrent.atomic.AtomicBoolean;
1516

1617
/**
1718
* 系统参数应用服务
@@ -25,6 +26,7 @@
2526
public class SysParamApplicationService {
2627
private final SysParamRepository sysParamRepository;
2728
private final RedisClient redisClient;
29+
private final AtomicBoolean redisEnable = new AtomicBoolean(true);
2830

2931
/**
3032
* 列表查询系统参数
@@ -59,17 +61,18 @@ public void deleteParamById(String paramKey) {
5961
}
6062

6163
public String getParamByKey(String paramId) {
62-
boolean redisEnable = false;
6364
String value = null;
64-
try {
65-
value = redisClient.getParamWithThrow(paramId);
66-
redisEnable = true;
67-
} catch (Exception e) {
68-
log.warn(e.getMessage());
65+
if (redisEnable.get()) {
66+
try {
67+
value = redisClient.getParamWithThrow(paramId);
68+
} catch (Exception e) {
69+
redisEnable.set(false);
70+
log.warn(e.getMessage());
71+
}
6972
}
7073
if (value == null) {
7174
SysParam sysParam = sysParamRepository.getById(paramId);
72-
if (sysParam != null && redisEnable) {
75+
if (sysParam != null) {
7376
value = sysParam.getParamValue();
7477
}
7578
}

frontend/src/pages/DataCleansing/Create/components/ParamConfig.tsx

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import {
88
InputNumber,
99
Slider,
1010
Space,
11+
Switch,
1112
} from "antd";
1213
import { ConfigI, OperatorI } from "@/pages/OperatorMarket/operator.model";
1314

@@ -215,12 +216,12 @@ const ParamConfig: React.FC<ParamConfigProps> = ({
215216
tooltip={param.description}
216217
key={paramKey}
217218
>
218-
<Checkbox
219-
checked={value as boolean}
220-
onChange={(e) => updateValue(e.target.checked)}
221-
>
222-
{param.name}
223-
</Checkbox>
219+
<Switch
220+
checkedChildren={param.checkedLabel}
221+
unCheckedChildren={param.unCheckedLabel}
222+
defaultChecked={param.defaultVal === 'true'}
223+
onChange={(checked) => updateValue(checked)}
224+
/>
224225
</Form.Item>
225226
);
226227
case "multiple":

runtime/ops/filter/img_similar_images_cleaner/process.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,8 @@ def get_orb_similarity(self, des_matrix: np.ndarray, des_matrix_history: np.ndar
139139
orb_similarity = count / len(matches)
140140
return orb_similarity
141141
except Exception as e:
142-
logger.exception(f"taskId: self.task_uuid, failed to compare the similarity between "
143-
f"file_name and file_name_history: {e}")
142+
logger.exception(f"taskId: {self.task_uuid}, failed to compare the similarity between "
143+
f"{file_name} and {file_name_history}: {e}")
144144
return 0.0
145145

146146
def execute_sql(self, p_hash: str, des_matrix: np.ndarray, file_name: str,

runtime/ops/mapper/html_tag_cleaner/metadata.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,12 @@ effect:
1414
after: '机器学习是人工智能的一个分支。'
1515
inputs: 'text'
1616
outputs: 'text'
17+
# Operator parameters rendered in the cleansing-task configuration form.
settings:
  # Toggle controlling whether table tags are stripped together with the other HTML tags.
  removeTableTags:
    name: '是否去除表格标签'
    description: '若为是,则会去除表格标签<tr><td>等。'
    type: 'switch'
    defaultVal: 'false'
    required: false
    # Switch captions; kept consistent with the HtmlTagCleaner row seeded in
    # scripts/db/data-operator-init.sql so both sources render the same labels.
    checkedLabel: '是'
    unCheckedLabel: '否'

runtime/ops/mapper/html_tag_cleaner/process.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,14 @@ class HtmlTagCleaner(Mapper):
3434
'<sup>', '<template>', '<textarea>', '<tfoot>', '<thead>', '<time>', '<title>', '<track>', '<tt>', '<u>',
3535
'<ul>', '<var>', '<video>', '<wbr>', '<xmp>'
3636
]
37+
# 需要添加的表格标签
38+
table_tags = ['<table>', '<tbody>', '<td>', '<th>', '<tr>']
3739
preserved_attr_list = ['colspan', 'rowspan'] # 需要保留的标签属性列表
3840

41+
def __init__(self, *args, **kwargs):
    """Initialise the cleaner and read the ``removeTableTags`` switch.

    Args:
        *args: Positional arguments forwarded unchanged to the parent constructor.
        **kwargs: Keyword arguments forwarded unchanged to the parent constructor.
            ``removeTableTags`` (optional) selects whether table tags are removed
            as well; it may be a real boolean or a textual flag such as
            ``'true'`` / ``'false'``. Defaults to ``False`` when absent.
    """
    super().__init__(*args, **kwargs)
    raw_flag = kwargs.get('removeTableTags', False)
    if isinstance(raw_flag, str):
        # The operator metadata declares defaultVal as the string 'false', and
        # bool('false') is True because any non-empty string is truthy.
        # Interpret textual flags explicitly so 'false' really disables the option.
        raw_flag = raw_flag.strip().lower() in ('true', '1', 'yes', 'on')
    self.remove_table_tags = bool(raw_flag)
44+
3945
@staticmethod
4046
def _remove_specified_tags(input_data: str, specified_tags: List):
4147
"""移除指定html标签及其属性值"""
@@ -68,13 +74,15 @@ def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
6874
if sample[self.filetype_key] != "xml":
6975
sample[self.text_key] = self._remove_html_tags(sample[self.text_key])
7076
logger.info(
71-
f"fileName: sample[self.filename_key], method: HtmlTagCleaner costs {time.time() - start:6f} s")
77+
f"fileName: {sample[self.filename_key]}, method: HtmlTagCleaner costs {time.time() - start:6f} s")
7278
else:
7379
logger.info(f"fileName: {sample[self.filename_key]}, method: HtmlTagCleaner, The file is xml!")
7480
return sample
7581

7682
def _remove_html_tags(self, input_data: str):
77-
# 去除常见的html标签及其属性值(不包括<table>、<tbody>、<tr>、<td>、<th>)
83+
# 去除常见的html标签及其属性值
84+
if self.remove_table_tags:
85+
self.tag_list.extend(self.table_tags)
7886
cleaned_text = self._remove_specified_tags(input_data, self.tag_list)
7987
# 去除表格标签内的属性值(不包括colspan、rowspan属性),eg:<td class="td8" rowspan="3"> —> <td rowspan="3">
8088
cleaned_text = self._remove_tag_attributes(cleaned_text, self.preserved_attr_list)

runtime/ops/mapper/img_direction_correct/process.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def _detect_direction(image, file_name, model):
6464
rotate_angle = int(cls_res.get("class_ids", np.array([0], dtype='int32')).item())
6565
pro = float(cls_res.get("scores", np.array([0], dtype='int32')).item())
6666
logger.info(
67-
f"fileName: file_name, model model.model_name detect result is {rotate_angle} with confidence pro")
67+
f"fileName: {file_name}, model {model.model_name} detect result is {rotate_angle} with confidence {pro}")
6868
if rotate_angle == 90 and pro > 0.89:
6969
return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
7070
if rotate_angle == 180 and pro > 0.89:
@@ -107,7 +107,7 @@ def execute(self, sample: Dict[str, Any]):
107107
data = bytes_transform.bytes_to_numpy(img_bytes)
108108
correct_data = self._img_direction_correct(data, file_name, self.model)
109109
sample[self.data_key] = bytes_transform.numpy_to_bytes(correct_data, file_type)
110-
logger.info(f"fileName: file_name, method: ImgDirectionCorrect costs {time.time() - start:6f} s")
110+
logger.info(f"fileName: {file_name}, method: ImgDirectionCorrect costs {time.time() - start:6f} s")
111111
return sample
112112

113113
def _img_direction_correct(self, img, file_name, standard_model):

runtime/ops/mapper/img_enhanced_saturation/process.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def enhance_saturation(self, image_data: np.ndarray, file_name):
5252

5353
# 图片饱和度较高,不需要增强饱和度
5454
if saturation_factor <= 1:
55-
logger.info(f"fileName: file_name, method: ImgSaturation not need enhancement")
55+
logger.info(f"fileName: {file_name}, method: ImgSaturation not need enhancement")
5656
return image_data
5757

5858
# 计算图片红色通道均值, 如果过大,需要限制saturation factor大小,否则图片会泛红, 产生色彩畸变。
@@ -78,5 +78,5 @@ def execute(self, sample: Dict[str, Any]):
7878
img_data = bytes_transform.bytes_to_numpy(img_bytes)
7979
img_data = self.enhance_saturation(img_data, file_name)
8080
sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
81-
logger.info(f"fileName: file_name, method: ImgSaturation costs {time.time() - start:6f} s")
81+
logger.info(f"fileName: {file_name}, method: ImgSaturation costs {time.time() - start:6f} s")
8282
return sample

runtime/ops/mapper/xml_tag_cleaner/process.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,9 @@ def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
5757
if sample[self.filetype_key] == "xml":
5858
try:
5959
sample[self.text_key] = self._tag_clean_xml(sample[self.text_key])
60-
logger.info(f"fileName: file_name, method: XMLTagCleaner costs {time.time() - start:6f} s")
60+
logger.info(f"fileName: {file_name}, method: XMLTagCleaner costs {time.time() - start:6f} s")
6161
except ExpatError as err:
62-
logger.error(f"fileName: {file_name} is abnormal xml form: err")
62+
logger.error(f"fileName: {file_name} is abnormal xml form: {err}")
6363
raise RuntimeError(81001, str(err)) from None
6464
except Exception as err:
6565
logger.error(f"fileName {file_name}, method: XMLTagCleaner causes other error: {err}")

scripts/db/data-operator-init.sql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ ON CONFLICT DO NOTHING;
134134

135135
INSERT INTO t_operator
136136
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
137-
VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
137+
VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, '{"exportType":{"name":"导出类型","description":"指定清洗结果文件类型。若指定为md且后续存在其他清洗算子,可能导致文件格式错乱。","type":"select","defaultVal":"markdown","required":false,"options":[{"label":"markdown","value":"md"},{"label":"txt","value":"txt"}]}}', '', false),
138138
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
139139
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
140140
('FileWithHighSpecialCharRateFilter', '文档特殊字符率检查', '去除特殊字符过多的文档。', '1.0.0', 'text', 'text', null, '{"specialCharRatio": {"name": "文档特殊字符率", "description": "特殊字符的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.3, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
@@ -148,7 +148,7 @@ VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取P
148148
('ExtraSpaceCleaner', '多余空格去除', '移除文档首尾、句中或标点符号附近多余空格和 tab 等。', '1.0.0', 'text', 'text', null, null, '', 'false'),
149149
('FullWidthCharacterCleaner', '全角转半角', '将文档中的所有全角字符转换成半角字符。', '1.0.0', 'text', 'text', null, null, '', 'false'),
150150
('GrableCharactersCleaner', '文档乱码去除', '去除文档中的乱码和无意义的unicode。', '1.0.0', 'text', 'text', null, null, '', 'false'),
151-
('HtmlTagCleaner', 'HTML标签去除', '移除文档中HTML标签,如 <html>、<dev>、<p> 等。', '1.0.0', 'text', 'text', null, null, '', 'false'),
151+
('HtmlTagCleaner', 'HTML标签去除', '移除文档中HTML标签,如 <html>、<dev>、<p> 等。', '1.0.0', 'text', 'text', null, '{"removeTableTags":{"name":"是否去除表格标签","description":"若为是,则会去除表格标签<tr><td>等。","type":"switch","defaultVal":"false","required":false,"checkedLabel":"是","unCheckedLabel":"否"}}', '', 'false'),
152152
('AnonymizedIdNumber', '身份证号匿名化', '身份证号匿名化。', '1.0.0', 'text', 'text', null, null, '', 'false'),
153153
('InvisibleCharactersCleaner', '不可见字符去除', '去除文档中的不可见字符,例如 0-31 号字符中的部分字符。', '1.0.0', 'text', 'text', null, null, '', 'false'),
154154
('AnonymizedIpAddress', 'IP地址匿名化', 'IP地址匿名化', '1.0.0', 'text', 'text', null, null, '', 'false'),

0 commit comments

Comments
 (0)