数据帧收发主要函数及netdevice 结构.docx
《数据帧收发主要函数及netdevice 结构.docx》由会员分享,可在线阅读,更多相关《数据帧收发主要函数及netdevice 结构.docx(23页珍藏版)》请在冰豆网上搜索。
数据帧收发主要函数及netdevice结构
Linux TCP/IP 协议栈学习 2——数据帧收发主要函数及 net_device 结构
/**
*netif_rx-postbuffertothenetworkcode
*@skb:
buffertopost
*
*thisfunctionreceivesapacketfromadevicedriverandqueuesitfor
*theupper(protocol)levelstoprocess.italwayssucceeds.thebuffer
*maybedroppedduringprocessingforcongestioncontrolorbythe
*protocollayers.
*
*returnvalues:
*net_rx_success(nocongestion)
*net_rx_drop(packetwasdropped)
*
*/
intnetif_rx(structsk_buff*skb)
{
structsoftnet_data*queue;
unsignedlongflags;
/*ifnetpollwantsit,pretendweneversawit*/
if(netpoll_rx(skb))
returnnet_rx_drop;
if(!
skb->tstamp.tv64)//得到帧接收的时间
net_timestamp(skb);
/*
*thecodeisrearrangedsothatthepathisthemost
*shortwhencpuiscongested,butisstilloperating.
*/
local_irq_save(flags);
queue=&__get_cpu_var(softnet_data);//获取当前cpu的softnet_data数据
__get_cpu_var(netdev_rx_stat).total++;//当前cpu接收的帧数+1
if(queue->input_pkt_queue.qlen<=netdev_max_backlog){
//监测设备是否还有空间来存储帧,如果空间已满,表示网络阻塞严重,则返回一个错误,此后cpu将丢掉再来的帧。
if(queue->input_pkt_queue.qlen){
enqueue:
//将该帧加入到softnet_data队列
__skb_queue_tail(&queue->input_pkt_queue,skb);
local_irq_restore(flags);
returnnet_rx_success;
}
//当队列是空的时候,表明这个队列并没有被软中断所schedule,因此我们需要将此队列加入到软中断的处理链表中。
可以看到加入的正好是backlog,由于调用netif_rx的是非napi的驱动,因此backlog就是初始化时的process_backlog函数。
napi_schedule(&queue->backlog);
gotoenqueue;
}
__get_cpu_var(netdev_rx_stat).dropped++;
local_irq_restore(flags);
kfree_skb(skb);
returnnet_rx_drop;
}
//上面代码中用到一个关键的数据结构 softnet_data。在网卡收发数据的时候,需要维护一个缓冲区队列,来缓存可能存在的突发数据;在协议栈中用一个队列层来表示该缓冲区,队列层位于数据链路层和网络层之间。
//softnet_data 就是数据链路层中的数据结构,它是一个 per-CPU 变量,每个 CPU 都有一个自己的 softnet_data 实例。
/**
*netif_receive_skb-processreceivebufferfromnetwork
*@skb:
buffertoprocess
*
*netif_receive_skb()isthemainreceivedataprocessingfunction.
*italwayssucceeds.thebuffermaybedroppedduringprocessing
*forcongestioncontrolorbytheprotocollayers.
*
*thisfunctionmayonlybecalledfromsoftirqcontextandinterrupts
*shouldbeenabled.
*
*returnvalues(usuallyignored):
*net_rx_success:
nocongestion
*net_rx_drop:
packetwasdropped
*/
//netif_receive_skb是对于netif_rx的napi对等函数;它递交一个报文给内核.当一个napi兼容的驱动已耗尽接收报
文的供应,它应当重开中断,并且调用netif_rx_complete(现在是__napi_complete())来停止轮询.
intnetif_receive_skb(structsk_buff*skb)
{
structpacket_type*ptype,*pt_prev;
structnet_device*orig_dev;
structnet_device*master;
structnet_device*null_or_orig;
structnet_device*null_or_bond;
intret=net_rx_drop;
__be16type;
if(!
skb->tstamp.tv64)
net_timestamp(skb);
if(vlan_tx_tag_present(skb)&&vlan_hwaccel_do_receive(skb))
returnnet_rx_success;
/*ifwe'vegottenherethroughnapi,checknetpoll*/
if(netpoll_receive_skb(skb))
returnnet_rx_drop;
if(!
skb->skb_iif)
skb->skb_iif=skb->dev->ifindex;//记录帧的入口
null_or_orig=null;
orig_dev=skb->dev;
master=access_once(orig_dev->master);
if(master){
if(skb_bond_should_drop(skb,master))
null_or_orig=orig_dev;/*deliveronlyexactmatch*/
else
skb->dev=master;
}
__get_cpu_var(netdev_rx_stat).total++;
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
skb->mac_len=skb->network_header-skb->mac_header;
pt_prev=null;
rcu_read_lock();
#ifdefconfig_net_cls_act
if(skb->tc_verd&tc_ncls){
skb->tc_verd=clr_tc_ncls(skb->tc_verd);
gotoncls;
}
#endif
//处理ptype_all上所有的packet_type->func(),这里先提一下linux是根据packet_type通过dev_add_pack()函数来注册相应的处理函数,后面会讲如何注册,每种包对应哪个处理函数
//staticstructlist_headptype_all__read_mostly;
list_for_each_entry_rcu(ptype,&ptype_all,list){
if(ptype->dev==null_or_orig||ptype->dev==skb->dev||
ptype->dev==orig_dev){
if(pt_prev)
ret=deliver_skb(skb,pt_prev,orig_dev);//调用相应的包处理函数
pt_prev=ptype;
}
}
#ifdefconfig_net_cls_act
skb=handle_ing(skb,&pt_prev,&ret,orig_dev);
if(!
skb)
gotoout;
ncls:
#endif
//若编译内核时选上bridge,下面会执行网桥模块
skb=handle_bridge(skb,&pt_prev,&ret,orig_dev);
if(!
skb)
gotoout;
//编译内核时选上mac_vlan模块,下面才会执行
skb=handle_macvlan(skb,&pt_prev,&ret,orig_dev);
if(!
skb)
gotoout;
/*
*makesureframesreceivedonvlaninterfacesstackedon
*bondinginterfacesstillmaketheirwaytoanybasebonding
*devicethatmayhaveregisteredforaspecificptype.the
*handlermayhavetoadjustskb->devandorig_dev.
*/
null_or_bond=null;
if((skb->dev->priv_flags&iff_802_1q_vlan)&&
(vlan_dev_real_dev(skb->dev)->priv_flags&iff_bonding)){
null_or_bond=vlan_dev_real_dev(skb->dev);
}
//最后type=skb->protocol;&ptype_base[ntohs(type)&15]处理ptype_base[ntohs(type)&15]上的所有的packet_type->func(),根据第二层不同协议来进入不同的钩子函数,重要的有:
ip_rcv(),arp_rcv()
type=skb->protocol;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type)&ptype_hash_mask],list){
if(ptype->type==type&&(ptype->dev==null_or_orig||
ptype->dev==skb->dev||ptype->dev==orig_dev||
ptype->dev==null_or_bond)){
if(pt_prev)
ret=deliver_skb(skb,pt_prev,orig_dev);
pt_prev=ptype;
}
}
if(pt_prev){
ret=pt_prev->func(skb,skb->dev,pt_prev,orig_dev);
}else{
kfree_skb(skb);
/*jamal,nowyouwillnotabletoescapeexplaining
*mehowyouweregoingtousethis.:
-)
*/
ret=net_rx_drop;
}
out:
rcu_read_unlock();
returnret;
}
/**
*dev_queue_xmit-transmitabuffer
*@skb:
buffertotransmit
*
*queueabufferfortransmissiontoanetworkdevice.thecallermust
*havesetthedeviceandpriorityandbuiltthebufferbeforecalling
*thisfunction.thefunctioncanbecalledfromaninterrupt.
*
*anegativeerrnocodeisreturnedonafailure.asuccessdoesnot
*guaranteetheframewillbetransmittedasitmaybedroppeddue
*tocongestionortrafficshaping.
*
*-----------------------------------------------------------------------------------
*inoticethismethodcanalsoreturnerrorsfromthequeuedisciplines,
*includingnet_xmit_drop,whichisapositivevalue.so,errorscanalso
*bepositive.
*
*regardlessofthereturnvalue,theskbisconsumed,soitiscurrently
*difficulttoretryasendtothismethod.(youcanbumptherefcount
*beforesendingtoholdareferenceforretryifyouarecareful.)
*
*whencallingthismethod,interruptsmustbeenabled.thisisbecause
*thebhenablecodemusthaveirqsenabledsothatitwillnotdeadlock.
*--blg
*/
intdev_queue_xmit(structsk_buff*skb)
{
structnet_device*dev=skb->dev;
structnetdev_queue*txq;
structqdisc*q;
intrc=-enomem;
/*gsowillhandlethefollowingemulationsdirectly.*/
if(netif_needs_gso(dev,skb))//如果是gso数据包,且设备支持gso数据包的处理
gotogso;
/*convertapagedskbtolinear,ifrequired*/
if(skb_needs_linearize(skb,dev)&&__skb_linearize(skb))
gotoout_kfree_skb;
/*ifpacketisnotchecksummedanddevicedoesnotsupport
*checksummingforthisprotocol,completechecksumminghere.
*/
if(skb->ip_summed==checksum_partial){
skb_set_transport_header(skb,skb->csum_start-
skb_headroom(skb));
if(!
dev_can_checksum(dev,skb)&&skb_checksum_help(skb))
gotoout_kfree_skb;
}
gso:
/*disablesoftirqsforvariouslocksbelow.also
*stopspreemptionforrcu.
*/
rcu_read_lock_bh();
txq=dev_pick_tx(dev,skb);
q=rcu_dereference_bh(txq->qdisc);
#ifdefconfig_net_cls_act
skb->tc_verd=set_tc_at(skb->tc_verd,at_egress);
#endif
if(q->enqueue){
rc=__dev_xmit_skb(skb,q,dev,txq);
gotoout;
}
/*thedevicehasnoqueue.commoncaseforsoftwaredevices:
loopback,allthesortsoftunnels...
really,itisunlikelythatnetif_tx_lockprotectionisnecessary
here.(f.e.loopbackandiptunnelsarecleanignoringstatistics
counters.)
however,itispossible,thattheyrelyonprotection
madebyushere.
checkthisandshotthelock.itisnotpronefromdeadlocks.
eithershotnoqueueqdisc,itisevensimpler8)
*/
if(dev->flags&iff_up){
intcpu=smp_processor_id();/*okbecausebhsareoff*/
if(txq->xmit_lock_owner!
=cpu){
hard_tx_lock(dev,txq,cpu);
if(!
netif_tx_queue_stopped(txq)){
rc=dev_hard_start_xmit(skb,dev,txq);
if(dev_xmit_complete(rc)){
hard_tx_unlock(dev,txq);
gotoout;
}
}
hard_tx_unlock(dev,txq);
if(net_ratelimit())
printk(kern_crit"virtualdevice%sasksto"
"queuepacket!
\n",dev->name);
}else{
/*recursionisdetected!
itispossible,
*unfortunately*/
if(net_ratelimit())
printk(kern_crit"deadlooponvirtualdevice"
"%s,fixiturgently!
\n",dev->name);
}
}
rc=-enetdown;
rcu_read_unlock_bh();
out_kfree_skb:
kfree_skb(skb);
returnrc;
out:
rcu_read_unlock_bh();
returnrc;
}
谈到数据链路层,就不得不提 struct net_device 及其相关结构。在 2.6.29 之后,net_device 结构进行了调整,操作函数被重构到了 net_device_ops 中。
下面简要分析一下:
structnet_device
{
/*
thisfirstfield,name,isthebeginningofthevisiblepartofthisstructure.itcontainsthestring
thatisthenameoftheinterface.byvisible,wemeanthatthispartofthedatastructureisgeneric
anddoesn’tcontainanyprivateareasspecifictoaparticulartypeofdevice.
*/
charname[ifnamsiz];
/*devicenamehashchain*/
structhlist_nodename_hlist;
/*snmpalias*/
char*ifalias;
/*
*i/ospecificfields
*fixme:
mergetheseandstructifmapintoone
*/
unsignedlongmem_end;/*sharedmemend*/
unsignedlongmem_start;/*sharedmemstart*/
unsignedlongbase_addr;/*devicei/oaddress*/