# General News Extractor Source Code Reading
## News source classification

By collection method, news sources fall into two groups (see the sketch below):
- Automatic parsing
- Manually configured XPath
  - Configuration list
  - Configuration details
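Where automatic parsing fails, GNE lets the caller supply field-level XPath overrides. A minimal sketch, assuming GNE's documented `GeneralNewsExtractor.extract` keyword arguments (`title_xpath`, `author_xpath`, `publish_time_xpath`); the HTML and XPath values are hypothetical per-site configuration:

```python
from gne import GeneralNewsExtractor

html = '<html>...</html>'  # hypothetical: page source fetched elsewhere

extractor = GeneralNewsExtractor()
result = extractor.extract(
    html,
    title_xpath='//h1[@class="article-title"]/text()',   # hypothetical per-site config
    author_xpath='//span[@class="byline"]/text()',
    publish_time_xpath='//time/@datetime',
)
print(result)  # {'title': ..., 'author': ..., 'publish_time': ..., 'content': ...}
```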
## GNE news content parsers

### Basic parsers

The core principle is a set of regular-expression patterns generalized from the common elements of news pages:
- AuthorExtractor
- ContentExtractor
- ListExtractor
- MetaExtractor
- TimeExtractor
- TitleExtractor
### AuthorExtractor

```python
AUTHOR_PATTERN = [
    "责编[:|:| |丨|/]\s*([\u4E00-\u9FA5a-zA-Z]{2,20})[^\u4E00-\u9FA5|:|:]",
    "责任编辑[:|:| |丨|/]\s*([\u4E00-\u9FA5a-zA-Z]{2,20})[^\u4E00-\u9FA5|:|:]",
    "作者[:|:| |丨|/]\s*([\u4E00-\u9FA5a-zA-Z]{2,20})[^\u4E00-\u9FA5|:|:]",
    "编辑[:|:| |丨|/]\s*([\u4E00-\u9FA5a-zA-Z]{2,20})[^\u4E00-\u9FA5|:|:]",
    "文[:|:| |丨|/]\s*([\u4E00-\u9FA5a-zA-Z]{2,20})[^\u4E00-\u9FA5|:|:]",
    "原创[:|:| |丨|/]\s*([\u4E00-\u9FA5a-zA-Z]{2,20})[^\u4E00-\u9FA5|:|:]",
    "撰文[:|:| |丨|/]\s*([\u4E00-\u9FA5a-zA-Z]{2,20})[^\u4E00-\u9FA5|:|:]",
    "来源[:|:| |丨|/]\s*([\u4E00-\u9FA5a-zA-Z]{2,20})[^\u4E00-\u9FA5|:|:|<]",
    # The following regexes still need further testing
    # '(作者[:|:| |丨|/]\s*[\u4E00-\u9FA5a-zA-Z、 ]{2,20})[)】)]]?[^\u4E00-\u9FA5|:|:]',
    # '(记者[:|:| |丨|/]\s*[\u4E00-\u9FA5a-zA-Z、 ]{2,20})[)】)]]?[^\u4E00-\u9FA5|:|:]',
    # '(原创[:|:| |丨|/]\s*[\u4E00-\u9FA5a-zA-Z、 ]{2,20})[)】)]]?[^\u4E00-\u9FA5|:|:]',
    # '(撰文[:|:| |丨|/]\s*[\u4E00-\u9FA5a-zA-Z、 ]{2,20})[)】)]]?[^\u4E00-\u9FA5|:|:]',
    # '(文/图[:|:| |丨|/]?\s*[\u4E00-\u9FA5a-zA-Z、 ]{2,20})[)】)]]?[^\u4E00-\u9FA5|:|:]',
]
```
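To see how these patterns are meant to be used: the extractor walks the pattern list over the page text and returns the first capture group that matches. A minimal sketch of that loop (the sample text is made up; GNE's actual method additionally gathers the text via `element.xpath('//text()')` first):

```python
import re

def extract_author(text: str) -> str:
    """Return the first author name captured by any pattern, else ''."""
    for pattern in AUTHOR_PATTERN:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    return ''

print(extract_author('文章正文。责任编辑/张三 出品'))  # -> '张三'
```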
### ContentExtractor

Based on the paper "Web body-text extraction based on text and symbol density" (基于文本及符号密度的网页正文提取方法).

Computing the text and symbol density:

```python
def calc_text_density(self, element):
    """
    Compute the density according to the formula:

               Ti - LTi
    TDi = --------------
             TGi - LTGi

    Ti: number of characters in node i
    LTi: number of characters inside hyperlinks in node i
    TGi: number of tags in node i
    LTGi: number of hyperlink tags in node i
    :return:
    """
    ti_text = '\n'.join(self.get_all_text_of_element(element))
    ti = len(ti_text)
    ti = self.increase_tag_weight(ti, element)
    a_tag_list = element.xpath('.//a')

    lti = len(''.join(self.get_all_text_of_element(a_tag_list)))
    tgi = len(element.xpath('.//*'))
    ltgi = len(a_tag_list)
    if (tgi - ltgi) == 0:
        if not self.need_skip_ltgi(ti, lti):
            return {'density': 0, 'ti_text': ti_text, 'ti': ti, 'lti': lti, 'tgi': tgi, 'ltgi': ltgi}
        else:
            ltgi = 0
    density = (ti - lti) / (tgi - ltgi)
    return {'density': density, 'ti_text': ti_text, 'ti': ti, 'lti': lti, 'tgi': tgi, 'ltgi': ltgi}
```
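A worked example of the formula: a node with 500 characters of text, 50 of them inside links, 20 tags in total and 5 of them `<a>` tags, scores (500 - 50) / (20 - 5) = 30. Body nodes score high (lots of text, few links) while navigation blocks score low. Below is a minimal self-contained sketch of the same computation on an lxml element, deliberately omitting GNE's `increase_tag_weight` and `need_skip_ltgi` refinements:

```python
from lxml.html import fromstring

def text_density(element) -> float:
    """(Ti - LTi) / (TGi - LTGi), with a guard for link-only nodes."""
    ti = len(''.join(element.xpath('.//text()')))       # Ti: all text
    lti = len(''.join(element.xpath('.//a//text()')))   # LTi: text inside links
    tgi = len(element.xpath('.//*'))                    # TGi: all descendant tags
    ltgi = len(element.xpath('.//a'))                   # LTGi: <a> tags
    if tgi - ltgi == 0:
        return 0.0
    return (ti - lti) / (tgi - ltgi)

node = fromstring('<div><p>long article text here...</p><a href="#">more</a></div>')
print(text_density(node))
```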
### ListExtractor

```python
import re
from gne.utils import config
from collections import deque
from lxml.html import HtmlElement


class ListExtractor:
    def extract(self, element: HtmlElement, feature):
        result = []
        if feature.startswith('/'):
            feature_element = element.xpath(feature)
        else:
            feature_element = element.xpath(f'//*[contains(text(), "{feature}")]')
        if not feature_element:
            print('feature not found!')
            return result
        parent = feature_element[0]
        leaf_class = parent.attrib.get('class', '')
        if leaf_class:
            leaf_node = f'{parent.tag}[@class="{leaf_class}"]'
        else:
            leaf_node = parent.tag
        is_a_tag = parent.tag == 'a'
        sub_path_queue = deque([leaf_node])
        # Walk up from the feature node; at each ancestor, test whether the
        # accumulated relative path matches enough siblings to look like a list.
        while parent is not None:
            parent = parent.getparent()
            if parent is None:
                break
            path = '/'.join(sub_path_queue)
            item_list = parent.xpath(path)
            if len(item_list) > 3:
                for item in item_list:
                    item_info = {'title': ''.join(item.xpath('text()'))}
                    if is_a_tag:
                        item_info['url'] = ''.join(item.xpath('@href'))
                    result.append(item_info)
                return result
            sub_path_queue.insert(0, parent.tag)
        return result
```
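A usage sketch against the class shown above: given a list page and the visible text of any one entry as the feature, the extractor infers the repeating path and returns every sibling entry. The HTML here is made up:

```python
from lxml.html import fromstring

html = fromstring('''
<ul>
  <li><a href="/news/1">First headline</a></li>
  <li><a href="/news/2">Second headline</a></li>
  <li><a href="/news/3">Third headline</a></li>
  <li><a href="/news/4">Fourth headline</a></li>
</ul>
''')
print(ListExtractor().extract(html, feature='First headline'))
# -> [{'title': 'First headline', 'url': '/news/1'}, ...]
```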
### MetaExtractor
### TimeExtractor

```python
DATETIME_PATTERN = [
    "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
    "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
    "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
    "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
    "(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    "(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    "(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    "(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
    "(\d{4}年\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
    "(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    "(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    "(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    "(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
    "(\d{2}年\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
    "(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    "(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    "(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    "(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
    "(\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
    "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",
    "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",
    "(\d{4}年\d{1,2}月\d{1,2}日)",
    "(\d{2}年\d{1,2}月\d{1,2}日)",
    "(\d{1,2}月\d{1,2}日)"
]
```
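The patterns are ordered from most to least specific (a full datetime with seconds down to a bare month-day), so the first match wins. A quick demonstration on a made-up string:

```python
import re

text = '发布时间:2023-08-01 12:30:45 来源:某网站'
for pattern in DATETIME_PATTERN:
    match = re.search(pattern, text)
    if match:
        print(match.group(1))  # -> 2023-08-01 12:30:45
        break
```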
```python
from gne.utils import config
from lxml.html import HtmlElement


class TimeExtractor:
    def __init__(self):
        self.time_pattern = DATETIME_PATTERN

    def extractor(self, element: HtmlElement, publish_time_xpath: str = '') -> str:
        publish_time_xpath = publish_time_xpath or config.get('publish_time', {}).get('xpath')
        publish_time = (self.extract_from_user_xpath(publish_time_xpath, element)  # first priority: user-supplied XPath
                        or self.extract_from_meta(element)   # second priority: <meta> tags
                        or self.extract_from_text(element))  # worst case: regex search over the body text
        return publish_time
```
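Of the three fallbacks, extract_from_meta is worth a closer look: GNE keeps a table of XPath expressions for `<meta>` tags that commonly carry the publication time. A minimal sketch of that idea with an abbreviated, hypothetical XPath list (GNE's real table covers many more CMSes):

```python
# Hypothetical, abbreviated list of well-known publish-time meta tags.
PUBLISH_TIME_META = [
    '//meta[@property="article:published_time"]/@content',
    '//meta[@name="publishdate"]/@content',
    '//meta[@itemprop="datePublished"]/@content',
]

def extract_from_meta(element) -> str:
    """Return the first non-empty publish time found in known meta tags."""
    for xpath in PUBLISH_TIME_META:
        value = ''.join(element.xpath(xpath)).strip()
        if value:
            return value
    return ''
```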
### TitleExtractor

Three sources for the title, tried in order of reliability:
- h tags: h1, h2, h3
- the `<title>` tag
- the longest common substring between an h-tag text and the `<title>` text

```python
def extract(self, element: HtmlElement, title_xpath: str = '') -> str:
    title_xpath = title_xpath or config.get('title', {}).get('xpath')
    title = (self.extract_by_xpath(element, title_xpath)
             or self.extract_by_htag_and_title(element)
             or self.extract_by_title(element)
             or self.extract_by_htag(element)
             )
    return title.strip()
```
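The interesting branch is extract_by_htag_and_title: page titles are often "headline - site name", so the longest common substring between an h tag's text and the `<title>` text usually isolates the headline. A minimal sketch of that idea using difflib (GNE's own implementation differs in detail):

```python
from difflib import SequenceMatcher
from lxml.html import fromstring

def title_by_htag_and_title(element) -> str:
    """Longest common substring between <title> and each h1-h3 text."""
    page_title = ''.join(element.xpath('//title/text()'))
    best = ''
    for h_text in element.xpath('//h1//text() | //h2//text() | //h3//text()'):
        m = SequenceMatcher(None, page_title, h_text).find_longest_match(
            0, len(page_title), 0, len(h_text))
        candidate = page_title[m.a:m.a + m.size]
        if len(candidate) > len(best):
            best = candidate
    return best.strip()

html = fromstring('<html><head><title>Breaking News - Example Site</title></head>'
                  '<body><h1>Breaking News</h1></body></html>')
print(title_by_htag_and_title(html))  # -> 'Breaking News'
```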
## LLM-based parser: llm_crawler

The prompt:

````python
sys_info = '''Your task is to operate as an HTML content extractor, focusing on parsing a provided HTML segment. Your objective is to retrieve the following details directly from the raw text within the HTML, without summarizing or altering the content:

- The document's title
- The complete main content, as it appears in the HTML, comprising all textual elements considered part of the core article body
- The publication time in its original format found within the HTML

Ensure your response fits the following JSON structure, accurately reflecting the extracted data without modification:

```json
{
    "title": "The Document's Exact Title",
    "content": "All the unaltered primary text content from the article",
    "publish_time": "Original Publication Time as per HTML"
}
```

It is essential that your output adheres strictly to this format, with each field filled based on the untouched information extracted directly from the HTML source.'''
````
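The section does not show how llm_crawler wires this prompt up, but the shape is a standard chat-completion call: the system prompt above, the raw HTML as the user message, and JSON parsed out of the reply. A hypothetical sketch using the OpenAI Python client (the client choice and model name are assumptions, not from the source):

```python
import json
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def llm_extract(html: str) -> dict:
    """Send raw HTML to the model and parse the JSON reply."""
    response = client.chat.completions.create(
        model='gpt-4o-mini',  # hypothetical model choice
        messages=[
            {'role': 'system', 'content': sys_info},
            {'role': 'user', 'content': html},
        ],
    )
    reply = response.choices[0].message.content
    # The model may wrap its answer in a ```json fence; strip it before parsing.
    reply = reply.strip().removeprefix('```json').removesuffix('```')
    return json.loads(reply)
```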
## Utilities for news HTML

- DOM node normalization: normalize_node
- Noise node removal: remove_noise_node
- Padding the host onto image URLs: pad_host_for_images
```python
from urllib.parse import urljoin, urlparse


def pad_host_for_images(host, url):
    """
    Image URLs on a site come in several forms:

    - full absolute path: https://xxx.com/1.jpg
    - relative path with no host at all: /1.jpg
    - host but no scheme: xxx.com/1.jpg or ://xxx.com/1.jpg

    :param host:
    :param url:
    :return:
    """
    if url.startswith('http'):
        return url
    parsed_uri = urlparse(host)
    scheme = parsed_uri.scheme
    if url.startswith(':'):
        return f'{scheme}{url}'
    if url.startswith('//'):
        return f'{scheme}:{url}'
    return urljoin(host, url)
```
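A quick check of the cases with a hypothetical host and paths:

```python
host = 'https://xxx.com/news/article.html'
print(pad_host_for_images(host, 'https://xxx.com/1.jpg'))  # unchanged
print(pad_host_for_images(host, '/1.jpg'))                 # -> https://xxx.com/1.jpg
print(pad_host_for_images(host, '://xxx.com/1.jpg'))       # -> https://xxx.com/1.jpg
print(pad_host_for_images(host, '//xxx.com/1.jpg'))        # -> https://xxx.com/1.jpg
```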
- Fixing malformed HTML: fix_html
……