数据清洗之网页中文过滤.docx-资源下载

数据清洗之网页中文过滤.docx

1、数据清洗之网页中文过滤数据清洗中文过滤实现编制人马飞所在部门研发中心编制时间2016年5月23日目录目录 2一、概述 3二、过滤规则 3三、实现代码 32.1表说明 32.2初始化 42.3包说明 4四、程序源码 53.1包头代码 53.2包体代码 63.2源代码附件 14一、概述根据项目要求，需将HBASE分库式数据库中存储的HTML代码按要求过滤操作，只保留中文、标点符点及部分HTML标签。现通过PL/SQL代码对其进行实现。二、过滤规则1、保留、table 标签，保留汉字及标点符号。过滤其它字符。2、保留、标签，保留其中汉字及标点符号，过滤其它字符。2、保留、h标签，保留其中汉字及

2、标点符号，滤掉其它字符。3、最终以文档，UTF-8编码返回结果。4、通过创建规则表，存储各种保留规则。三、实现代码2.1表说明表名功能描述代码UTF8_NOFILTER_CH非过滤字符表create table UTF8_NOFILTER_CH( CH NVARCHAR2(10), SM NVARCHAR2(100), BZ VARCHAR2(20);UTF8_WRITE_CH_TMP逐字符存放网页内容create table UTF8_WRITE_CH_TMP( V NVARCHAR2(1), V2 VARCHAR2(10), R INTEGER);UTF8存放测试数据，其中V列存放网页内容

3、，V2列存放处理好的网页内容，XH表示序号为主键，表示一共有多少个网页需处理。create table UTF8( V NCLOB, V2 NCLOB, XH INTEGER primary key);UTF8_WRITE_LINE_TMP将UTF8_WRITE_CH_TMP表中的字符合并为行create table UTF8_WRITE_LINE_TMP( V NVARCHAR2(4000), R INTEGER);UTF8_HANDLE_LINE_TMP存放UTF8_WRITE_LINE_TMP表中的每一行按过滤规则处理后的结果create table UTF8_HANDLE_LINE_T

4、MP( V NVARCHAR2(4000), R INTEGER);2.2初始化规则表名功能描述代码UTF8_NOFILTER_CH初始化非过滤字符数据，目前只包括，全、半角标点符号、部分需保留的HTML关键字。.scriptsinsertUTF8_NOFILTER_CH.sql2.3包说明包头子程序名功能描述接口代码（包头）filter按过滤规则批量处理UTF8表中的列“V“中的网页内容。procedure filter;filter(n_xh int)按过滤规则处理UTF8表中某一行的列“V“中的网页内容。procedure filter(n_xh int);包体子程序名功能描述接口代码（

5、包头）read_ch_from_nclob从NCLOB逐字符读取至表utf8_write_ch_tmp中。详见包体中read_ch_from_nclob过程代码read_line_from_ch将表utf8_write_ch_tmp中字符处理为行。详见包体中read_line_from_ch过程代码write_line_to_nclob_init首次写nclob时对其初始化详见包体中write_line_to_nclob_init过程代码write_line_to_nclob将处理好的行信息写入nclob字段中详见包体中write_line_to_nclobis_ch_filter功能：检测某

6、个字符是否为UTF8编码的汉字。入口：一个字符出口：1 为是，0为否详见包体中is_ch_filter函数is_sign_filter功能：检测某个字符是否为UTF8编码的全角、半角标点符号。入口：一个字符出口：1 为是，0为否详见包体中is_sign_filter函数is_valid_filter功能：检测某个字符串是否为UTF8编码的HTML关键字，具体支持列表详见“UTF8_NOFILTER_CH”表中的BZ=HTML的结果。入口：一个字符串出口：正数表示是，0为否详见包体中is_valid_filter函数process对表“UTF8_WRITE_LINE_TMP”中的每一行内容进行

7、按规则过滤操作。入口：处理前的行出口：过滤后的行详见包体中process函数handle_line按过滤规则循环处理“UTF8_WRITE_LINE_TMP”个中所有行。详见包体中handle_line函数handle_exception对处理后的行进行处理，删除无效行或对行进行修正详见包体中handle_exception过程。merge_into_nclob合并“UTF8_HANDLE_LINE_TMP”表中行并更新至UTF8表中的列“V2“列中。详见包体中merge_into_nclob过程。filter按过滤规则批量处理UTF8表中的列“V“中的网页内容。详见包体中filter过程。f

8、ilter(n_xh int)按过滤规则处理UTF8表中某一行的列“V“中的网页内容。详见包体中filter(n_xh int)过程。init过滤前的初始化工作详见包体中init过程。四、程序源码3.1包头代码create or replace package dp_filter_ch is /* 方法：filter 功能：按过滤规则批量处理UTF8表中的列“V“中的网页内容：规则如下： 1.保留中文字符 2.保留全角、半角标点符号 3.保留HTML保留字，详见规则表：utf8_nofilter_ch */ procedure filter; /* 方法：filter(n_xh int);

9、功能：按过滤规则处理UTF8表中某一行的列“V“中的网页内容：规则如下： 1.保留中文字符 2.保留全角、半角标点符号 3.保留HTML保留字，详见规则表：utf8_nofilter_ch */ procedure filter(n_xh int); end dp_filter_ch;3.2包体代码create or replace package body dp_filter_ch is p_xh int; procedure read_ch_from_nclob is directions nclob; buffer nvarchar2(1); amount int :=1; n_len

10、 int; n_exists int; begin select count(0) into n_exists from utf8_write_ch_tmp where xh = p_xh; if n_exists =0 then delete from utf8_write_ch_tmp where xh = p_xh; commit; select v,length(v) into directions,n_len from utf8 where xh = p_xh; dbms_lob.open(directions,dbms_lob.lob_readonly); for i in 1.n

11、_len loop DBMS_LOB.read(directions, amount, i, buffer); insert into utf8_write_ch_tmp(xh,r,v) values(p_xh,i,buffer); if mod(i,500) =0 then commit; end if; end loop; commit; DBMS_LOB.CLOSE(directions); end if; exception when others then null; end; procedure read_line_from_ch is v_buffer nvarchar2(400

12、0); n_offset int := 1; begin delete from utf8_write_line_tmp where xh = p_xh; commit; v_buffer:=; for i in ( select xh,v,r,ascii(v) ch from utf8_write_ch_tmp t where xh = p_xh order by r ) loop if i.ch10 and length(v_buffer)0 then if to_number(replace(asciistr(P_str),),XXXX) between n_begin and n_en

13、d then n_ret:=1; end if; end if; end if; return n_ret; end; function is_sign_filter(P_str nvarchar2) return int is n_exists int:=0; begin select count(0) into n_exists from dual where exists (select 1 from utf8_nofilter_ch t where t.bz = BDFH and t.ch = asciistr(P_str); return sign(n_exists); end; f

14、unction is_valid_filter(P_ch nvarchar2,P_pos int,P_str nvarchar2) return int is begin if P_ch =0 then n_len :=is_valid_filter(v_tmp,n_pos,P_str); v_ret := v_ret |substr(P_str,n_pos,n_len); n_pos := n_pos +n_len ; n_len :=1; -其他情况移动一个字符 elsif is_valid_filter(v_tmp,n_pos,P_str)= 0 then n_pos := n_pos

15、+1 ; n_len :=1; end if; end loop; return v_ret; end; procedure handle_line is v_buffer nvarchar2(4000); n_offset int := 1; begin delete utf8_handle_line_tmp where xh = p_xh ; v_buffer:=; for i in ( select r,v from utf8_write_line_tmp t where xh = p_xh order by r ) loop v_buffer := v_buffer | process

16、(i.v); insert into utf8_handle_line_tmp(xh,r,v) values(p_xh,i.r,v_buffer); v_buffer:=; end loop; commit; end; procedure merge_into_nclob is begin for i in(select r,v,ascii(v),lengthb(v) from utf8_handle_line_tmp where xh = p_xh order by r) loop write_line_to_nclob(i.v); end loop; end; procedure hand

17、le_exception is begin -删除一行中只有一个字符且该字符为ASCII=10 delete from utf8_write_line_tmp t where length(v)=1 and ascii(v)=10 ; -删除一行中ASCII字符为空或每行长度为1或2的行。 delete from utf8_handle_line_tmp t where ascii(t.v) is null or lengthb(t.v) in(1,2); commit; -将连续两个字符为以下情况的都替换为空 for i in(select r,v,ascii(v),lengthb(v) f

18、rom utf8_handle_line_tmp where xh = p_xh order by r) loop update utf8_handle_line_tmp set v=replace(v,.) where xh = p_xh and r=i.r; update utf8_handle_line_tmp set v=replace(v,) where xh = p_xh and r=i.r; update utf8_handle_line_tmp set v=replace(v,.,) where xh = p_xh and r=i.r; update utf8_handle_l

19、ine_tmp set v=replace(v,.,) where xh = p_xh and r=i.r; update utf8_handle_line_tmp set v=replace(v,.) where xh = p_xh and r=i.r; end loop; -以.或,开头的去掉开头字符,或以.结尾的字符 for i in (select rowid from utf8_handle_line_tmp where xh = p_xh and ( v like ,% or v like .% or v like %. ) ) loop update utf8_handle_li

20、ne_tmp set v=substr(v,2) where rowid=i.rowid; end loop; -以.结尾的字符 for i in (select rowid from utf8_handle_line_tmp where xh = p_xh and v like %. ) loop update utf8_handle_line_tmp set v=substr(v,1,length(v)-1) where rowid=i.rowid; end loop; -当一行内容只有以下字符时，删除改行 delete from utf8_handle_line_tmp where xh

21、 = p_xh and v is null; delete from utf8_handle_line_tmp where xh = p_xh and v=.; delete from utf8_handle_line_tmp where xh = p_xh and v=,; delete from utf8_handle_line_tmp where xh = p_xh and v=.; delete from utf8_handle_line_tmp where xh = p_xh and v=.; commit; end; procedure init(n_xh int) is begin p_xh := n_xh; update utf8 set v2=null where xh= p_xh; commit; end; pr

邮箱/手机：
温馨提示：	快捷下载时，用户名和密码都是您填写的邮箱或者手机号，方便查询和重复下载（系统自动生成）。如填写123，账号就是123，密码也是123。
特别说明：	请自助下载，系统不会自动发送文件的哦；如果您已付费，想二次下载，请登录后访问：我的下载记录
支付方式：
验证码：	换一换

账号：
密码：
验证码：	换一换
当日自动登录忘记密码？