/* tcp_input.c - net/ipv4/tcp_input.c - Linux source code 2.2.27-rc2 */

/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_input.c,v 1.164.2.25 2001/05/24 22:33:21 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:
 *		Pedro Roque	:	Fast Retransmit/Recovery.
 *					Two receive queues.
 *					Retransmit queue handled by TCP.
 *					Better retransmit timer handling.
 *					New congestion avoidance.
 *					Header prediction.
 *					Variable renaming.
 *
 *		Eric		:	Fast Retransmit.
 *		Randy Scott	:	MSS option defines.
 *		Eric Schenk	:	Fixes to slow start algorithm.
 *		Eric Schenk	:	Yet another double ACK bug.
 *		Eric Schenk	:	Delayed ACK bug fixes.
 *		Eric Schenk	:	Floyd style fast retrans war avoidance.
 *		David S. Miller	:	Don't allow zero congestion window.
 *		Eric Schenk	:	Fix retransmitter so that it sends
 *					next packet on ack of previous packet.
 *		Andi Kleen	:	Moved open_request checking here
 *					and process RSTs for open_requests.
 *		Andi Kleen	:	Better prune_queue, and other fixes.
 *		Andrey Savochkin:	Fix RTT measurements in the presence of
 *					timestamps.
 *		Andrey Savochkin:	Check sequence numbers correctly when
 *					removing SACKs due to in sequence incoming
 *					data segments.
 *		Andi Kleen:		Make sure we never ack data there is not
 *					enough room for. Also make this condition
 *					a fatal error if it might still happen.
 *		Andi Kleen:		Add tcp_measure_rcv_mss to make 
 *					connections with MSS<min(MTU,ann. MSS)
 *					work without delayed acks. 
 *		Andi Kleen:		Process packets with PSH set in the
 *					fast path.
 *		Vincent Zweije		Fix TIME-WAIT FIN ACK bug.
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <net/tcp.h>
#include <linux/ipsec.h>

/* Initial value for sysctl_tcp_syncookies below: 0 when the sysctl
 * interface is configured in, 1 when it is not.
 */
#ifdef CONFIG_SYSCTL
#define SYNC_INIT 0 /* let the user enable it */
#else
#define SYNC_INIT 1
#endif

/* Declared here, defined in another file. */
extern int sysctl_tcp_fin_timeout;

/* These are on by default so the code paths get tested.
 * For the final 2.2 this may be undone at our discretion. -DaveM
 *
 * tcp_parse_options() consults each of them before honouring the
 * corresponding option (timestamp, window scale, SACK).
 */
int sysctl_tcp_timestamps = 1;
int sysctl_tcp_window_scaling = 1;
int sysctl_tcp_sack = 1;

int sysctl_tcp_syncookies = SYNC_INIT; 
/* File-scope, no initializer: starts out as 0. */
int sysctl_tcp_stdurg;
/* File-scope, no initializer: starts out as 0.  While it is 0, a RST
 * received for a TIME_WAIT bucket destroys that bucket (see
 * tcp_timewait_state_process()).
 */
int sysctl_tcp_rfc1337;

/* Forward declaration; the definition is not in this part of the file. */
static int prune_queue(struct sock *sk);

/* Maintain tp->ato, the delayed-ACK timeout estimate.
 *
 * Keep the following in mind when analysing its behaviour.  When a
 * connection starts up we want to ack as quickly as possible, because
 * "good" TCPs do slow start at the beginning of data transmission.
 * Until we have sent the first few ACKs the sender sits on most of his
 * data: he may only have snd_cwnd unacked packets in flight, and each
 * ACK we send lets him increment snd_cwnd and transmit more of his
 * queue.  -DaveM
 *
 * Called from tcp_ack() for every acceptable segment carrying data.
 */
static void tcp_delack_estimator(struct tcp_opt *tp)
{
	int delta;

	if (tp->ato == 0) {
		/* First data segment on this connection.  Seeding ato
		 * with a non-zero value guarantees this branch is never
		 * taken again, and quick-ack mode helps the sender
		 * leave slow start quickly.
		 */
		tp->lrcvtime = tcp_time_stamp;
		tp->ato = 1;
		tcp_enter_quickack_mode(tp);
		return;
	}

	/* Time since the previous data segment, at least one tick. */
	delta = tcp_time_stamp - tp->lrcvtime;
	tp->lrcvtime = tcp_time_stamp;
	if (delta <= 0)
		delta = 1;

	if (delta > tp->rto) {
		/* Never let the estimate exceed the current rto. */
		tp->ato = tp->rto;
		return;
	}

	/* Shifting left by one and right by two drops the top bit
	 * (the "quick ack mode" bit) and halves what remains before
	 * the new sample is added.
	 */
	tp->ato = ((tp->ato << 1) >> 2) + delta;
}

/*
 * Note that one more ACK is owed to the peer and will be sent later.
 *
 * A small segment (less than half of mss_cache) with PSH set
 * additionally caps the ato estimate at HZ/50; such tiny-grams
 * artificially deflate the ato measurement, but only down to that
 * lower bound.
 */
static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
					struct sk_buff *skb)
{
	tp->delayed_acks++;

	if (!th->psh)
		return;
	if (skb->len >= (tp->mss_cache >> 1))
		return;

	/* Clamp the low 31 bits only, so the quickack state held in
	 * the top bit of ato is preserved.
	 */
	if ((tp->ato & 0x7fffffff) > HZ/50)
		tp->ato = (tp->ato & 0x80000000) | (HZ/50);
}

/* Fold one round-trip measurement into the smoothed estimate.
 *
 * The samples fed to this routine come either from timestamps or from
 * segments known _not_ to have been retransmitted [see Karn/Partridge,
 * Proceedings SIGCOMM 87].  The algorithm is from the SIGCOMM 88 piece
 * by Van Jacobson.
 *
 * NOTE: this and the next two routines used to be one big routine.
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 *
 * mrtt is the measured round trip in tcp_time_stamp ticks; a sample
 * of zero is treated as one tick.
 */

static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
{
	long err = mrtt;	/* starts as the sample, becomes the error */

	if (err == 0)
		err = 1;

	if (tp->srtt == 0) {
		/* No previous measurement: take the sample as the rtt.
		 * srtt is kept scaled by 8, mdev is seeded with 4 times
		 * the sample.
		 */
		tp->srtt = err << 3;
		tp->mdev = err << 2;
		return;
	}

	/* Jacobson's scaled integer update (SIGCOMM '88); srtt and
	 * mdev are scaled versions of the rtt and its mean deviation.
	 * A 1990 paper changed the timeout to RTO = rtt + 4 * mdev.
	 */
	err -= (tp->srtt >> 3);		/* error in the current estimate */
	tp->srtt += err;		/* rtt = 7/8 rtt + 1/8 new */
	if (err < 0)
		err = -err;		/* abs(error) */
	err -= (tp->mdev >> 2);
	tp->mdev += err;		/* mdev = 3/4 mdev + 1/4 new */
}

/* Calculate rto without backoff.  This is the second half of Van Jacobson's
 * routine referred to above.
 *
 * The base value is (srtt >> 3) + max(HZ/5, mdev).  It is then inflated
 * by a quarter of itself plus base / 2^(snd_cwnd - 1), a term that fades
 * away as the congestion window grows.
 *
 * The result is not clamped here; the callers in this file follow up
 * with tcp_bound_rto().
 */

static __inline__ void tcp_set_rto(struct tcp_opt *tp)
{
	/* Shifting by a count that is not smaller than the width of the
	 * operand is undefined in C (x86 silently uses the count modulo
	 * 32, which would inflate the rto again for windows above 32
	 * packets).  Treat every such count, including the wrapped value
	 * produced by snd_cwnd == 0, as "the term has decayed to zero".
	 */
	const unsigned int rto_bits = sizeof(tp->rto) * 8;
	unsigned int shift = (unsigned int)tp->snd_cwnd - 1U;
	unsigned long cwnd_term;

	tp->rto = (tp->srtt >> 3) + max(HZ/5, tp->mdev);

	/* Both increments are derived from the same base value. */
	cwnd_term = (shift < rto_bits) ? (tp->rto >> shift) : 0;
	tp->rto += (tp->rto >> 2) + cwnd_term;
}
 

/* Clamp the rto into [HZ/5, 120*HZ].
 *
 * 120*HZ is the upper bound on packet lifetime in the internet.  The
 * HZ/5 lower bound is needed to behave correctly against BSD stacks
 * with a fixed delayed ack.
 *
 * FIXME: It's not entirely clear this lower bound is the best way to
 * avoid the problem.  Is it possible to drop the lower bound and still
 * avoid trouble with BSD stacks?  Perhaps some modification to the RTO
 * calculation that takes delayed ack bias into account?  This needs
 * serious thought. -- erics
 */
static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
{
	/* The two limits cannot both be violated, so one test chain
	 * is enough.
	 */
	if (tp->rto > 120*HZ) {
		tp->rto = 120*HZ;
	} else if (tp->rto < HZ/5) {
		tp->rto = HZ/5;
	}
}

/* Record the peer's timestamp (tp->rcv_tsval) as ts_recent when the
 * segment [start_seq, end_seq] qualifies.
 *
 * WARNING: this must not be called if tp->saw_timestamp was false.
 *
 * sk is not used.
 */
extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp,
					     __u32 start_seq, __u32 end_seq)
{
	/* Require start_seq <= last_ack_sent, combined with an in-window
	 * check: given start_seq <= last_ack_sent <= rcv_nxt, the segment
	 * is in window when end_seq >= rcv_nxt.
	 */
	if (after(start_seq, tp->last_ack_sent))
		return;
	if (before(end_seq, tp->rcv_nxt))
		return;

	/* Only move ts_recent forward.
	 *
	 * PAWS bug workaround wrt. ACK frames: the extra check in
	 * tcp_paws_discard() makes sure an older value can only show up
	 * here for pure ACK frames.  -DaveM
	 *
	 * Plus: expired timestamps.
	 *
	 * Plus: resets failing PAWS.
	 */
	if ((s32)(tp->rcv_tsval - tp->ts_recent) < 0)
		return;

	tp->ts_recent = tp->rcv_tsval;
	tp->ts_recent_stamp = tcp_time_stamp;
}

/* 24 days expressed in clock ticks. */
#define PAWS_24DAYS	(HZ * 60 * 60 * 24 * 24)

/* PAWS test.  Returns 1 when the segment should be discarded: its
 * timestamp is older than ts_recent, ts_recent itself was recorded
 * less than 24 days ago, and the segment is more than a bare header.
 * Returns 0 otherwise.
 *
 * len is compared against th->doff * 4, so it counts the TCP header.
 */
extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len)
{
	/* Timestamp not older than what we remember: fine. */
	if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0)
		return 0;

	/* What we remember has expired. */
	if ((s32)(tcp_time_stamp - tp->ts_recent_stamp) >= PAWS_24DAYS)
		return 0;

	/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */
	return len != (th->doff * 4);
}


/* Check segment sequence number for validity.
 *
 * Segment controls are considered valid if the segment fits into the
 * window after truncation to the window.  Acceptability of data (and
 * SYN, FIN, of course) is checked separately; see tcp_data_queue(),
 * for example.
 *
 * Also, controls (RST is the main one) are accepted using
 * last_ack_sent instead of RCV.NXT.  The peer has not yet advanced his
 * SND.UNA when we delayed an ACK, so that his SND.UNA <= last_ack_sent.
 * (borrowed from freebsd)
 *
 * Returns 1 when the segment [seq, end_seq] is acceptable, 0 otherwise.
 */
static inline int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
{
	/* Ends before the last ACK we sent: stale. */
	if (before(end_seq, tp->last_ack_sent))
		return 0;

	/* Must not start beyond the right edge of the receive window. */
	return !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
}

/* When we get a reset we do this: mark the socket zapped, record the
 * error matching the state it was in, move it to TCP_CLOSE, shut down
 * both directions and report the error unless the socket is dead.
 */
static void tcp_reset(struct sock *sk)
{
	unsigned char prev_state = sk->state;

	sk->zapped = 1;

	/* We want the right error as BSD sees it (and indeed as we do). */
	if (prev_state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	else if (prev_state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
	else
		sk->err = ECONNRESET;

	tcp_set_state(sk, TCP_CLOSE);

	if (prev_state == TCP_SYN_SENT) {
		/* Back out identity changes done by connect.
		 * The move to TCP_CLOSE has unhashed us and
		 * killed the bind bucket reference, making this
		 * safe. -DaveM
		 */
		sk->dport = 0;
		sk->daddr = 0;
		sk->num = 0;
		tcp_clear_xmit_timer(sk, TIME_RETRANS);
	}

	sk->shutdown = SHUTDOWN_MASK;
	if (!sk->dead)
		sk->error_report(sk);
}

/* This tags the retransmission queue when SACKs arrive. */
static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int i = nsacks;

	while(i--) {
		struct sk_buff *skb = skb_peek(&sk->write_queue);
		__u32 start_seq = ntohl(sp->start_seq);
		__u32 end_seq = ntohl(sp->end_seq);
		int fack_count = 0;

		while((skb != NULL) &&
		      (skb != tp->send_head) &&
		      (skb != (struct sk_buff *)&sk->write_queue)) {
			/* The retransmission queue is always in order, so
			 * we can short-circuit the walk early.
			 */
			if(after(TCP_SKB_CB(skb)->seq, end_seq))
				break;

			/* We play conservative, we don't allow SACKS to partially
			 * tag a sequence space.
			 */
			fack_count++;
			if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
			   !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
				/* If this was a retransmitted frame, account for it. */
				if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) &&
				   tp->retrans_out)
					tp->retrans_out--;
				TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;

				/* RULE: All new SACKs will either decrease retrans_out
				 *       or advance fackets_out.
				 */
				if(fack_count > tp->fackets_out)
					tp->fackets_out = fack_count;
			}
			skb = skb->next;
		}
		sp++; /* Move on to the next SACK block. */
	}
}

/* Look for tcp options. Normally only called on SYN and SYNACK packets.
 * But, this can also be called on packets in the established flow when
 * the fast version below fails.
 *
 * sk       - owning socket, may be NULL.  SACK blocks are applied to
 *            the write queue only when it is non-NULL.
 * th       - TCP header; the options are the th->doff*4 minus
 *            sizeof(struct tcphdr) bytes that follow it.
 * tp       - receives the results: mss_clamp, wscale_ok/snd_wscale,
 *            tstamp_ok/saw_tstamp/rcv_tsval/rcv_tsecr, sack_ok/num_sacks.
 * no_fancy - when non-zero, window scaling, timestamps and
 *            SACK-permitted are ignored.
 *
 * tp->saw_tstamp is always reset first.  Parsing stops at EOL, at an
 * option whose length is below 2 or does not fit in what remains, and
 * at a kind byte that has no room for its length byte.  In every case
 * a SYN that carried no usable MSS option gets mss_clamp forced to 536.
 */
void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
{
	unsigned char *ptr;
	int length=(th->doff*4)-sizeof(struct tcphdr);
	int saw_mss = 0;

	ptr = (unsigned char *)(th + 1);
	tp->saw_tstamp = 0;

	while(length>0) {
		int opcode=*ptr++;
		int opsize;

		switch (opcode) {
			case TCPOPT_EOL:
				goto check_syn;
			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
				length--;
				continue;
			default:
				/* Every other option is kind + length + data.
				 * With a single byte left there is no length
				 * byte to fetch; reading one would step past
				 * the end of the TCP header.
				 */
				if (length < 2)
					goto check_syn;
				opsize=*ptr++;
				if (opsize < 2) /* "silly options" */
					goto check_syn;
				if (opsize > length)
					goto check_syn;	/* don't parse partial options */
				switch(opcode) {
				case TCPOPT_MSS:
					/* Only meaningful on a SYN; can only
					 * lower the clamp, and 0 means 536.
					 */
					if(opsize==TCPOLEN_MSS && th->syn) {
						u16 in_mss = ntohs(*(__u16 *)ptr);
						if (in_mss == 0)
							in_mss = 536;
						if (tp->mss_clamp > in_mss)
							tp->mss_clamp = in_mss;
						saw_mss = 1;
					}
					break;
				case TCPOPT_WINDOW:
					if(opsize==TCPOLEN_WINDOW && th->syn)
						if (!no_fancy && sysctl_tcp_window_scaling) {
							tp->wscale_ok = 1;
							tp->snd_wscale = *(__u8 *)ptr;
							/* Shift counts above 14 are
							 * clamped to 14.
							 */
							if(tp->snd_wscale > 14) {
								if(net_ratelimit())
									printk("tcp_parse_options: Illegal window "
									       "scaling value %d >14 received.",
									       tp->snd_wscale);
								tp->snd_wscale = 14;
							}
						}
					break;
				case TCPOPT_TIMESTAMP:
					if(opsize==TCPOLEN_TIMESTAMP) {
						if (sysctl_tcp_timestamps && !no_fancy) {
							tp->tstamp_ok = 1;
							tp->saw_tstamp = 1;
							tp->rcv_tsval = ntohl(*(__u32 *)ptr);
							tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
						}
					}
					break;
				case TCPOPT_SACK_PERM:
					if(opsize==TCPOLEN_SACK_PERM && th->syn) {
						if (sysctl_tcp_sack && !no_fancy) {
							tp->sack_ok = 1;
							tp->num_sacks = 0;
						}
					}
					break;

				case TCPOPT_SACK:
					/* At least one whole block, never on a
					 * SYN, and only with a socket whose
					 * write queue can be tagged.
					 */
					if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
					   sysctl_tcp_sack && (sk != NULL) && !th->syn) {
						int sack_bytes = opsize - TCPOLEN_SACK_BASE;

						if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) {
							int num_sacks = sack_bytes >> 3;
							struct tcp_sack_block *sackp;

							sackp = (struct tcp_sack_block *)ptr;
							tcp_sacktag_write_queue(sk, sackp, num_sacks);
						}
					}
					break;
				};
				/* Kind and length bytes were consumed above. */
				ptr+=opsize-2;
				length-=opsize;
		};
	}
check_syn:
	if (th->syn && saw_mss == 0)
		tp->mss_clamp = 536;
}

/* Fast parse options. This hopes to only see timestamps.
 * If it is wrong it falls back on tcp_parse_options().
 *
 * Returns 0 when the header carries no options at all (saw_tstamp is
 * cleared), 1 in every other case.
 */
static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp)
{
	/* Bare header, nothing to parse. */
	if (th->doff == (sizeof(struct tcphdr)>>2)) {
		tp->saw_tstamp = 0;
		return 0;
	}

	/* Header plus exactly one aligned timestamp option? */
	if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
		__u32 *opt = (__u32 *)(th + 1);

		/* Expected layout: NOP, NOP, TIMESTAMP kind, length,
		 * followed by the two 32-bit timestamp values.
		 */
		if (opt[0] == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
					       | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
			tp->saw_tstamp = 1;
			tp->rcv_tsval = ntohl(opt[1]);
			tp->rcv_tsecr = ntohl(opt[2]);
			return 1;
		}
	}

	/* Anything else goes through the general parser. */
	tcp_parse_options(sk, th, tp, 0);
	return 1;
}

/* Bits accumulated in the local "flag" word of tcp_ack().  The two
 * *_ACKED bits are produced by tcp_clean_rtx_queue(); before
 * tcp_fast_retrans() is called the word is masked down to
 * FLAG_DATA | FLAG_WIN_UPDATE.
 */
#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
#define FLAG_RETRANS_DATA_ACKED	0x08 /* This ACK acknowledged new data, some of which was retransmitted. */

/* Leave the fast retransmit phase: forget the duplicate ACK count and,
 * if more than three duplicates had been counted, set the congestion
 * window to ssthresh.
 */
static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
{
	const int past_third_dup = (tp->dup_acks > 3);

	tp->dup_acks = 0;
	if (past_third_dup)
		tp->snd_cwnd = tp->snd_ssthresh;
}

/* Fast retransmit / fast recovery state machine, driven by tcp_ack().
 *
 * sk      - socket whose write queue may be retransmitted from.
 * ack     - acknowledgment number of the incoming segment.
 * not_dup - FLAG_DATA / FLAG_WIN_UPDATE bits of the segment as passed
 *           in by tcp_ack(); zero means the segment carried neither
 *           data nor an accepted window update.
 *
 * tp->high_seq is non-zero while a recovery phase is in progress and
 * holds snd_nxt as it was when the phase started.
 *
 * NOTE: This code assumes that tp->dup_acks gets cleared when a
 * retransmit timer fires.
 */
static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Note: If not_dup is set this implies we got a
	 * data carrying packet or a window update.
	 * This carries no new information about possible
	 * lost packets, so we have to ignore it for the purposes
	 * of counting duplicate acks. Ideally this does not imply we
	 * should stop our fast retransmit phase, more acks may come
	 * later without data to help us. Unfortunately this would make
	 * the code below much more complex. For now if I see such
	 * a packet I clear the fast retransmit phase.
	 */
	if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
		/* This is the standard reno style fast retransmit branch. */

		/* 1. When the third duplicate ack is received, set ssthresh
		 * to one half the current congestion window, but no less
		 * than two segments. Retransmit the missing segment.
		 */
		if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
			tp->dup_acks++;
			/* Trigger on the third duplicate, or as soon as
			 * more than three packets are covered by SACK
			 * information (fackets_out).
			 */
			if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
                                tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
                                tp->snd_cwnd = (tp->snd_ssthresh + 3);
				tp->high_seq = tp->snd_nxt;
				/* Without SACK information resend the head
				 * of the queue, otherwise let the FACK code
				 * choose what to resend.
				 */
				if(!tp->fackets_out)
					tcp_retransmit_skb(sk,
							   skb_peek(&sk->write_queue));
				else
					tcp_fack_retransmit(sk);
                                tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
			}
		} else if (++tp->dup_acks > 3) {
			/* 2. Each time another duplicate ACK arrives, increment 
			 * cwnd by the segment size. [...] Transmit a packet...
			 *
			 * Packet transmission will be done on normal flow processing
			 * since we're not in "retransmit mode".  We do not use
			 * duplicate ACKs to artificially inflate the congestion
			 * window when doing FACK.
			 */
			if(!tp->fackets_out) {
				tp->snd_cwnd++;
			} else {
				/* Fill any further holes which may have
				 * appeared.
				 *
				 * We may want to change this to run every
				 * further multiple-of-3 dup ack increments,
				 * to be more robust against out-of-order
				 * packet delivery.  -DaveM
				 */
				tcp_fack_retransmit(sk);
			}
		}
	} else if (tp->high_seq != 0) {
		/* In this branch we deal with clearing the Floyd style
		 * block on duplicate fast retransmits, and if requested
		 * we do Hoe style secondary fast retransmits.
		 */
		if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) {
			/* Once we have acked all the packets up to high_seq
			 * we are done with this fast retransmit phase.
			 * Alternatively data arrived. In this case we
			 * have to abort the fast retransmit attempt.
			 * Note that we do want to accept a window
			 * update since this is expected with Hoe's algorithm.
			 */
			clear_fast_retransmit(tp);

			/* After we have cleared up to high_seq we can
			 * clear the Floyd style block.
			 */
			if (!before(ack, tp->high_seq)) {
				tp->high_seq = 0;
				tp->fackets_out = 0;
			}
		} else if (tp->dup_acks >= 3) {
			if (!tp->fackets_out) {
				/* Hoe Style. We didn't ack the whole
				 * window. Take this as a cue that
				 * another packet was lost and retransmit it.
				 * Don't muck with the congestion window here.
				 * Note that we have to be careful not to
				 * act if this was a window update and it
				 * didn't ack new data, since this does
				 * not indicate a packet left the system.
				 * We can test this by just checking
				 * if ack changed from snd_una, since
				 * the only way to get here without advancing
				 * from snd_una is if this was a window update.
				 */
				if (ack != tp->snd_una && before(ack, tp->high_seq)) {
                                	tcp_retransmit_skb(sk,
							   skb_peek(&sk->write_queue));
                                	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
				}
			} else {
				/* FACK style, fill any remaining holes in
				 * receiver's queue.
				 */
				tcp_fack_retransmit(sk);
			}
		}
	}
}

/* This is Jacobson's slow start and congestion avoidance.
 * SIGCOMM '88, p. 328.
 *
 * Called once for each ACK that should open the congestion window.
 */
static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
{
	/* In "safe" area (at or below ssthresh): grow by one per ACK. */
	if (tp->snd_cwnd <= tp->snd_ssthresh) {
		tp->snd_cwnd++;
		return;
	}

	/* In dangerous area, increase slowly.  In theory this is
	 * tp->snd_cwnd += 1 / tp->snd_cwnd; in practice ACKs are counted
	 * and the window grows once the count has reached the window.
	 */
	if (tp->snd_cwnd_cnt < tp->snd_cwnd) {
		tp->snd_cwnd_cnt++;
		return;
	}

	tp->snd_cwnd++;
	tp->snd_cwnd_cnt = 0;
}

/* Remove acknowledged frames from the retransmission queue.
 *
 * Every skb at the head of the write queue that was already sent and
 * ends at or before ack is unlinked and freed.  For each of them *seq
 * is set to its starting sequence number and *seq_rtt to the time
 * elapsed since its "when" stamp, so on return both describe the last
 * skb removed; they are left untouched when nothing was removed.
 *
 * Returns a mask of FLAG_DATA_ACKED / FLAG_RETRANS_DATA_ACKED.
 */
static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
			       __u32 *seq, __u32 *seq_rtt)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	__u32 now = tcp_time_stamp;
	int acked = 0;

	/* If we are retransmitting, and this ACK clears up to
	 * the retransmit head, or further, then clear our state.
	 */
	if (tp->retrans_head != NULL &&
	    !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq))
		tp->retrans_head = NULL;

	for (;;) {
		struct sk_buff *skb = skb_peek(&sk->write_queue);
		struct tcp_skb_cb *cb;
		__u8 sacked;

		/* Queue exhausted, or reached data not yet sent. */
		if (skb == NULL || skb == tp->send_head)
			break;

		cb = TCP_SKB_CB(skb);
		sacked = cb->sacked;

		/* Only a packet lying entirely before the ack sequence
		 * is confirmed to have arrived at the other end.
		 */
		if (after(cb->end_seq, ack))
			break;

		if ((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out)
			tp->retrans_out--;

		if (cb->flags & TCPCB_FLAG_SYN) {
			/* Initial outgoing SYN's get put onto the
			 * write_queue just like anything else we
			 * transmit.  It is not true data, and if we
			 * misinform our callers that this ACK acks real
			 * data, we will erroneously exit connection
			 * startup slow start one packet too quickly.
			 * This is severely frowned upon behavior.
			 *
			 * Clearing retrans_head is pure paranoia.
			 */
			tp->retrans_head = NULL;
		} else {
			acked |= FLAG_DATA_ACKED;
			if (sacked & TCPCB_SACKED_RETRANS)
				acked |= FLAG_RETRANS_DATA_ACKED;
			if (tp->fackets_out)
				tp->fackets_out--;
		}

		tp->packets_out--;
		*seq = cb->seq;
		*seq_rtt = now - cb->when;
		__skb_unlink(skb, skb->list);
		kfree_skb(skb);
	}

	return acked;
}

/* An ACK arrived while a zero window probe was pending.  Either stop
 * probing, when the advertised window now has room for the segment at
 * send_head, or re-arm the probe timer with the backed-off rto (capped
 * at 120*HZ).
 */
static void tcp_ack_probe(struct sock *sk, __u32 ack)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int window_usable;

	/* Our probe was answered. */
	tp->probes_out = 0;

	/* Was it a usable window open?  send_head should always be
	 * non-null here.
	 */
	window_usable = tp->send_head != NULL &&
			!before(ack + tp->snd_wnd,
				TCP_SKB_CB(tp->send_head)->end_seq);

	if (!window_usable) {
		tcp_reset_xmit_timer(sk, TIME_PROBE0,
				     min(tp->rto << tp->backoff, 120*HZ));
		return;
	}

	tp->backoff = 0;
	tp->pending = 0;
	tcp_clear_xmit_timer(sk, TIME_PROBE0);
}
 
/* Should we open up the congestion window?
 *
 * flag is the FLAG_* mask collected by tcp_ack().  Returns 1 to
 * advance, 0 to leave the window alone.
 */
static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag)
{
	/* Data must have been acked. */
	if (!(flag & FLAG_DATA_ACKED))
		return 0;

	/* New non-retransmitted data acked, always advance.  */
	if (!(flag & FLAG_RETRANS_DATA_ACKED))
		return 1;

	/* Some of the data acked was retransmitted.  We advance in all
	 * cases except during non-FACK fast retransmit/recovery, which
	 * does its own congestion window management; don't get in the
	 * way of that.
	 */
	return (tp->fackets_out != 0 || tp->retransmits != 0);
}

/* Timestamp based RTT sampling for an incoming ACK.
 *
 * Read draft-ietf-tcplw-high-performance before mucking
 * with this code. (Supersedes RFC1323)
 *
 * sk, seq and ack are not used; flag is the FLAG_* mask from tcp_ack().
 */
static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
			       u32 seq, u32 ack, int flag)
{
	__u32 seq_rtt;

	/* RTTM Rule: A TSecr value received in a segment is used to
	 * update the averaged RTT measurement only if the segment
	 * acknowledges some new data, i.e., only if it advances the
	 * left edge of the send window.
	 *
	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
	 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
	 */
	if ((flag & FLAG_DATA_ACKED) == 0)
		return;

	seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
	tcp_rtt_estimator(tp, seq_rtt);

	/* Everything outstanding has been acked: the retransmit phase
	 * is over and the backoff is dropped.
	 */
	if (tp->retransmits && tp->packets_out == 0) {
		tp->retransmits = 0;
		tp->backoff = 0;
	}

	tcp_set_rto(tp);

	/* Still retransmitting, use backoff */
	if (tp->retransmits)
		tp->rto = tp->rto << tp->backoff;

	tcp_bound_rto(tp);
}

static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
{
	struct sk_buff *skb = skb_peek(&sk->write_queue);

	/* Some data was ACK'd, if still retransmitting (due to a
	 * timeout), resend more of the retransmit queue.  The
	 * congestion window is handled properly by that code.
	 */
	if (tp->retransmits) {
		tcp_xmit_retransmit_queue(sk);
		tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
	} else {
		__u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
		if ((__s32)when < 0)
			when = 1;
		tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
	}
}

/* This routine deals with incoming acks, but not outgoing ones.
 *
 * sk      - socket the segment was received on.
 * th      - TCP header of the segment (window, doff and fin are read).
 * ack_seq - value tested against tp->snd_wl1 for the window update;
 *           presumably the segment's own sequence number
 *           (NOTE(review): confirm against the callers).
 * ack     - acknowledgment number carried by the segment.
 * len     - segment length as compared with th->doff*4; any excess
 *           over the header is taken to be payload.
 *
 * Returns 1 when the ACK was processed (or the socket is zapped) and
 * 0 when the ACK lies outside [snd_una, snd_nxt] and was ignored.
 */
static int tcp_ack(struct sock *sk, struct tcphdr *th, 
		   u32 ack_seq, u32 ack, int len)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	/* Advertised window, scaled by the negotiated send shift. */
	u32 nwin = ntohs(th->window) << tp->snd_wscale;
	int flag = 0;
	/* Filled in by tcp_clean_rtx_queue() when it removes packets. */
	u32 seq = 0;
	u32 seq_rtt = 0;

	if(sk->zapped)
		return(1);	/* Dead, can't ack any more so why bother */

	/* Any ACK counts as an answer to a pending keepalive. */
	if (tp->pending == TIME_KEEPOPEN) {
	  	tp->probes_out = 0;
		tp->pending = 0;
	}

	tp->rcv_tstamp = tcp_time_stamp;

	/* If the ack is newer than sent or older than previous acks
	 * then we can probably ignore it.
	 */
	if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
		goto uninteresting_ack;

	/* If there is data set flag 1 */
	if (len != th->doff*4) {
		flag |= FLAG_DATA;
		tcp_delack_estimator(tp);
	}

	/* Update our send window. */

	/* This is the window update code as per RFC 793
	 * snd_wl{1,2} are used to prevent unordered
	 * segments from shrinking the window 
	 */
	if (after(ack_seq, tp->snd_wl1) ||
	    (tp->snd_wl1 == ack_seq &&
	     (after(ack, tp->snd_wl2) ||
	      (tp->snd_wl2 == ack && nwin > tp->snd_wnd)))) {
		flag |= FLAG_WIN_UPDATE;
		tp->snd_wl1 = ack_seq;
		tp->snd_wl2 = ack;
		tp->snd_wnd = nwin;

		if (nwin > tp->max_window)
			tp->max_window = nwin;
	} else if (after(ack, tp->snd_una)) {
		/* Bad case. Window update is not accepted.
		 * We will lockup. Break RFC to survive.
		 *
		 * The window is shrunk by the amount of newly acked
		 * data, never below zero.
		 */
		tp->snd_wnd -= min(ack-tp->snd_una, tp->snd_wnd);
	}

	/* We passed data and got it acked, remove any soft error
	 * log. Something worked...
	 */
	sk->err_soft = 0;

	/* If this ack opens up a zero window, clear backoff.  It was
	 * being used to time the probes, and is probably far higher than
	 * it needs to be for normal retransmission.
	 */
	if (tp->pending == TIME_PROBE0)
		tcp_ack_probe(sk, ack);

	/* See if we can take anything off of the retransmit queue. */
	flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);

	/* We must do this here, before code below clears out important
	 * state contained in tp->fackets_out and tp->retransmits.  -DaveM
	 */
	if (should_advance_cwnd(tp, flag))
		tcp_cong_avoid(tp);

	/* If we have a timestamp, we always do rtt estimates. */
	if (tp->saw_tstamp) {
		tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
	} else {
		/* If we were retransmitting don't count rtt estimate. */
		if (tp->retransmits) {
			if (tp->packets_out == 0) {
				tp->retransmits = 0;
			}
		} else {
			/* We don't have a timestamp. Can only use
			 * packets that are not retransmitted to determine
			 * rtt estimates. Also, we must not reset the
			 * backoff for rto until we get a non-retransmitted
			 * packet. This allows us to deal with a situation
			 * where the network delay has increased suddenly.
			 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */
			if (flag & FLAG_DATA_ACKED) {
				if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
					tp->backoff = 0;
					tcp_rtt_estimator(tp, seq_rtt);
					tcp_set_rto(tp);
					tcp_bound_rto(tp);
				}
			}
		}
	}

	/* Retransmit timer: re-arm it while packets remain in flight
	 * and this ACK made progress, stop it once nothing is left.
	 */
	if (tp->packets_out) {
		if (flag & FLAG_DATA_ACKED)
			tcp_ack_packets_out(sk, tp);
	} else {
		tcp_clear_xmit_timer(sk, TIME_RETRANS);
		tp->fackets_out = 0;
		tp->retrans_out = 0;
	}

	/* From here on only the "segment had data" and "window was
	 * updated" bits matter.  tp->snd_una still holds the previous
	 * value, so "ack == tp->snd_una" means nothing new was acked.
	 */
	flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
	if ((ack == tp->snd_una	&& tp->packets_out && flag == 0) ||
	    (tp->high_seq != 0)) {
		tcp_fast_retrans(sk, ack, flag);
	} else {
		/* Clear any aborted fast retransmit starts. */
		tp->dup_acks = 0;
	}
	/* It is not a brain fart, I thought a bit now. 8)
	 *
	 * Forward progress is indicated, if:
	 *   1. the ack acknowledges new data.
	 *   2. or the ack is duplicate, but it is caused by new segment
	 *      arrival. This case is filtered by:
	 *      - it contains no data, syn or fin.
	 *      - it does not update window.
	 *   3. or new SACK. It is difficult to check, so that we ignore it.
	 *
	 * Forward progress is also indicated by arrival new data,
	 * which was caused by window open from our side. This case is more
	 * difficult and it is made (alas, incorrectly) in tcp_data_queue().
	 *                                              --ANK (990513)
	 */
	if (ack != tp->snd_una || (flag == 0 && !th->fin))
		dst_confirm(sk->dst_cache);

	/* Remember the highest ack received. */
	tp->snd_una = ack;
	return 1;

uninteresting_ack:
	SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
	return 0;
}

/* New-style handling of TIME_WAIT sockets.
 *
 * Timer helpers for TIME_WAIT buckets, defined in another file.  In
 * this file tcp_tw_schedule() is called when a bucket is created,
 * tcp_tw_reschedule() when an ACK arrives for one, and
 * tcp_tw_deschedule() right before a bucket is killed.
 */
extern void tcp_tw_schedule(struct tcp_tw_bucket *tw);
extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw);
extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);

/* Destroy a TIME_WAIT bucket: unlink it from the bind hash (freeing
 * the bind bucket if that leaves it without owners), from the
 * established hash and from the protocol's socket list, then return it
 * to its slab cache.  tw must not be touched afterwards.
 *
 * The callers in this file invoke tcp_tw_deschedule() first.
 */
void tcp_timewait_kill(struct tcp_tw_bucket *tw)
{
	struct tcp_bind_bucket *tb = tw->tb;

	/* Disassociate with bind bucket. */
	if(tw->bind_next)
		tw->bind_next->bind_pprev = tw->bind_pprev;
	*(tw->bind_pprev) = tw->bind_next;
	/* That may have removed the last owner of the bind bucket; if
	 * so, unlink the bucket from its chain and free it as well.
	 */
	if (tb->owners == NULL) {
		if (tb->next)
			tb->next->pprev = tb->pprev;
		*(tb->pprev) = tb->next;
		kmem_cache_free(tcp_bucket_cachep, tb);
	}

	/* Unlink from established hashes. */
	if(tw->next)
		tw->next->pprev = tw->pprev;
	*tw->pprev = tw->next;

	/* We decremented the prot->inuse count when we entered TIME_WAIT
	 * and the sock from which this came was destroyed.
	 *
	 * So only the sklist linkage is left to undo here.
	 */
	tw->sklist_next->sklist_prev = tw->sklist_prev;
	tw->sklist_prev->sklist_next = tw->sklist_next;

	/* Ok, now free it up. */
	kmem_cache_free(tcp_timewait_cachep, tw);
}

/* We come here as a special case from the AF specific TCP input processing,
 * and the SKB has no owner.  Essentially handling this is very simple,
 * we just keep silently eating rx'd packets, acking them if necessary,
 * until none show up for the entire timeout period. 
 *
 * tw  - the TIME_WAIT bucket the segment matched; it may have been
 *       freed by the time this function returns.
 * skb - the received segment.
 * th  - its TCP header.
 * len - segment length as compared with th->doff * 4.
 *
 * Return 0, TCP_TW_ACK, TCP_TW_RST
 */
enum tcp_tw_status 
tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
			       struct tcphdr *th, unsigned len)
{
	/*	RFC 1122:
	 *	"When a connection is [...] on TIME-WAIT state [...]
	 *	[a TCP] MAY accept a new SYN from the remote TCP to
	 *	reopen the connection directly, if it:
	 *	
	 *	(1)  assigns its initial sequence number for the new
	 *	connection to be larger than the largest sequence
	 *	number it used on the previous connection incarnation,
	 *	and
	 *
	 *	(2)  returns to TIME-WAIT state if the SYN turns out 
	 *	to be an old duplicate".
	 */
	if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) {
		struct sock *sk;
		/* Saved now, because tw is freed a few lines below. */
		struct tcp_func *af_specific = tw->af_specific;
		__u32 isn;

		/* Start the new incarnation above anything the old one
		 * sent; 0 is avoided.
		 */
		isn = tw->snd_nxt + 128000;
		if(isn == 0)
			isn++;
		tcp_tw_deschedule(tw);
		tcp_timewait_kill(tw);
		/* With the bucket gone, look up a socket for the SYN. */
		sk = af_specific->get_sock(skb, th);
		if(sk == NULL ||
		   !ipsec_sk_policy(sk,skb) ||
		   atomic_read(&sk->sock_readers) != 0)
			return 0;
		skb_set_owner_r(skb, sk);
		af_specific = sk->tp_pinfo.af_tcp.af_specific;
		if(af_specific->conn_request(sk, skb, isn) < 0)
			return TCP_TW_RST; /* Toss a reset back. */
		return 0; /* Discard the frame. */
	}

	/* Check RST.
	 *
	 * NOTE(review): this comment used to read "Check RST or SYN" and
	 * the text below speaks of "two flavors", but the test only looks
	 * at th->rst.  A SYN that did not take the reopen path above
	 * therefore falls through to the else branch.  Confirm that this
	 * is intended.
	 */
	if(th->rst) {
		/* This is TIME_WAIT assassination, in two flavors.
		 * Oh well... nobody has a sufficient solution to this
		 * protocol bug yet.
		 */
		if(sysctl_tcp_rfc1337 == 0) {
			tcp_tw_deschedule(tw);
			tcp_timewait_kill(tw);
		}
		return 0;
	} else {
		/* In this case we must reset the TIMEWAIT timer. */
		if(th->ack)
			tcp_tw_reschedule(tw);
	}
	/* Ack old packets if necessary: the segment ends at or before
	 * rcv_nxt and carries payload or a FIN.
	 */
	if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt) &&
	    (len > (th->doff * 4) || th->fin))
		return TCP_TW_ACK; 
	return 0; 
}

/* Enter the time wait state.  This is always called from BH
 * context.  Essentially we whip up a timewait bucket, copy the
 * relevant info into it from the SK, and mess with hash chains
 * and list linkage.
 *
 * This helper does the linkage part: tw takes over sk's place in the
 * bind hash and in the protocol socket list, sk leaves the established
 * hash, and tw is inserted into the TIME_WAIT half of that table.
 */
static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
{
	struct sock **head, *sktw;

	/* Step 1: Remove SK from established hash. */
	if(sk->next)
		sk->next->pprev = sk->pprev;
	*sk->pprev = sk->next;
	sk->pprev = NULL;
	tcp_reg_zap(sk);

	/* Step 2: Put TW into bind hash where SK was.
	 *
	 * sk->prev is cast to the bind bucket pointer here, so it
	 * apparently doubles as the socket's bind bucket reference
	 * (NOTE(review): confirm against the bind code).
	 */
	tw->tb = (struct tcp_bind_bucket *)sk->prev;
	if((tw->bind_next = sk->bind_next) != NULL)
		sk->bind_next->bind_pprev = &tw->bind_next;
	tw->bind_pprev = sk->bind_pprev;
	*sk->bind_pprev = (struct sock *)tw;
	sk->prev = NULL;

	/* Step 3: Same for the protocol sklist.  Both neighbours are
	 * dereferenced without a NULL check.
	 */
	(tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw;
	(tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw;
	sk->sklist_next = NULL;
	sk->prot->inuse--;

	/* Step 4: Hash TW into TIMEWAIT half of established hash table.
	 *
	 * Same slot index as sk, offset by half the table size; tw is
	 * pushed at the head of that chain.
	 */
	head = &tcp_ehash[sk->hashent + (tcp_ehash_size/2)];
	sktw = (struct sock *)tw;
	if((sktw->next = *head) != NULL)
		(*head)->pprev = &sktw->next;
	*head = sktw;
	sktw->pprev = head;
}

void tcp_time_wait(struct sock *sk)
{
	struct tcp_tw_bucket *tw;

	tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
	if(tw != NULL) {
		/* Give us an identity. */
		tw->daddr	= sk->daddr;
		tw->rcv_saddr	= sk->rcv_saddr;
		tw->bound_dev_if= sk->bound_dev_if;
		tw->num		= sk->num;
		tw->state	= TCP_TIME_WAIT;
		tw->sport	= sk->sport;
		tw->dport	= sk->dport;
		tw->family	= sk->family;
		tw->reuse	= sk->reuse;
		tw->rcv_nxt	= sk->tp_pinfo.af_tcp.rcv_nxt;
		tw->snd_nxt     = sk->tp_pinfo.af_tcp.snd_nxt;
		tw->window	= tcp_select_window(sk);
		tw->af_specific	= sk->tp_pinfo.af_tcp.af_specific;

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
		if(tw->family == PF_INET6) {
			memcpy(&tw->v6_daddr,
			       &sk->net_pinfo.af_inet6.daddr,
			       sizeof(struct in6_addr));
			memcpy(&tw->v6_rcv_saddr,
			       &sk->net_pinfo.af_inet6.rcv_saddr,
			       sizeof(struct in6_addr));
		}
#endif
		/* Linkage updates. */
		tcp_tw_hashdance(sk, tw);

		/* Get the TIME_WAIT timeout firing. */
		tcp_tw_schedule(tw);

		/* CLOSE the SK. */
		if(sk->state == TCP_ESTABLISHED)
			tcp_statistics.TcpCurrEstab--;
		sk->state = TCP_CLOSE;
		net_reset_timer(sk, TIME_DONE,
				min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME));
	} else {
		/* Sorry, we're out of memory, just CLOSE this
		 * socket up.  We've got bigger problems than
		 * non-graceful socket closings.
		 */
		tcp_set_state(sk, TCP_CLOSE);
	}

	/* Prevent rcvmsg/sndmsg calls, and wake people up. */
	sk->shutdown = SHUTDOWN_MASK;
	if(!sk->dead)
		sk->state_change(sk);
}

/*
 *	Process the FIN bit.  The FIN takes effect only once it is validly
 *	part of the sequence space, not earlier while there are still holes.
 *
 *	State transitions performed here:
 *	  SYN_RECV, ESTABLISHED	-> CLOSE_WAIT (from there we later go on to
 *				   LAST-ACK and CLOSE, never TIME-WAIT)
 *	  FIN_WAIT1		-> CLOSING (simultaneous close, later on
 *				   to TIME-WAIT)
 *	  FIN_WAIT2		-> TIME-WAIT
 *	  CLOSE_WAIT, CLOSING,
 *	  LAST_ACK		-> unchanged
 *
 *	The FIN's end sequence is recorded in fin_seq and an ACK is sent in
 *	every case.  Unless the socket is dead, state_change is invoked and
 *	async waiters are signalled afterwards.
 */

static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;

	tcp_send_ack(sk);

	switch (sk->state) {
	case TCP_SYN_RECV:
	case TCP_ESTABLISHED:
		/* Peer closed first: move to CLOSE_WAIT. */
		tcp_set_state(sk, TCP_CLOSE_WAIT);
		if (th->rst)
			sk->shutdown = SHUTDOWN_MASK;
		break;

	case TCP_FIN_WAIT1:
		/* Simultaneous close: the received FIN has been acked
		 * above and we enter CLOSING.
		 *
		 * This causes a WRITE timeout, which will either move
		 * on to TIME_WAIT when we time out, or resend the FIN
		 * properly (maybe we get rid of that annoying FIN lost
		 * hang).  The TIME_WRITE code is already correct for
		 * handling this timeout.
		 */
		tcp_set_state(sk, TCP_CLOSING);
		break;

	case TCP_FIN_WAIT2:
		/* Our FIN was already acked; the ACK for theirs went out
		 * above, so enter TIME_WAIT.
		 */
		tcp_time_wait(sk);
		break;

	case TCP_CLOSE_WAIT:
	case TCP_CLOSING:
		/* Retransmission of a FIN we already processed. */
	case TCP_LAST_ACK:
		/* RFC793: Remain in the LAST-ACK state. */
		break;

	default:
		/* Only TCP_LISTEN and TCP_CLOSE are left, and in those
		 * states we should never get here.
		 */
		printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
		break;
	}

	if (sk->dead)
		return;

	sk->state_change(sk);
	sock_wake_async(sk->socket, 1);
}

/* These routines update the SACK block as out-of-order packets arrive or
 * in-order packets close up the sequence space.
 */

/* SP has just been grown.  If its new start or end now lands inside the
 * range of some other SACK block, absorb that block into SP and delete
 * it from the array.  At most one block is absorbed per call.
 */
static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
{
	struct tcp_sack_block *blk = &tp->selective_acks[0];
	int total = tp->num_sacks;
	int idx;

	/* A lone SACK has nothing to merge with. */
	if (total == 1)
		return;

	for (idx = 0; idx < total; idx++, blk++) {
		if (blk == sp)
			continue;

		/* Bottom of SP reaches into BLK: extend SP downwards. */
		if (between(sp->start_seq, blk->start_seq, blk->end_seq)) {
			sp->start_seq = blk->start_seq;
			break;
		}
		/* Top of SP reaches into BLK: extend SP upwards. */
		if (between(sp->end_seq, blk->start_seq, blk->end_seq)) {
			sp->end_seq = blk->end_seq;
			break;
		}
	}

	/* Walked off the end: nothing overlapped. */
	if (idx >= total)
		return;

	/* Delete BLK by sliding every later SACK up one slot. */
	while (idx < total - 1) {
		blk[0].start_seq = blk[1].start_seq;
		blk[0].end_seq = blk[1].end_seq;
		idx++;
		blk++;
	}
	tp->num_sacks--;
}

/* Exchange the start/end sequence numbers of two SACK blocks. */
static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
{
	__u32 saved_start = sack1->start_seq;
	__u32 saved_end = sack1->end_seq;

	sack1->start_seq = sack2->start_seq;
	sack1->end_seq = sack2->end_seq;

	sack2->start_seq = saved_start;
	sack2->end_seq = saved_end;
}

/* Record a newly queued out-of-order SKB in the SACK array.
 *
 * The block describing the new data always ends up in slot 0.  If SKB
 * is exactly adjacent to an existing block, that block is grown (and
 * moved to the front if needed); otherwise a fresh block is built at
 * the front and the others are shifted down, dropping the last one
 * when the array is already at its limit (3 blocks with timestamps
 * negotiated, 4 without).
 */
static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct tcp_sack_block *head = &tp->selective_acks[0];
	int nsacks = tp->num_sacks;
	int limit;
	int i;

	if (nsacks != 0) {
		/* Optimize for the common case, new ofo frames arrive
		 * "in order". ;-)  This also satisfies the requirements
		 * of RFC2018 about ordering of SACKs.
		 */
		if (head->end_seq == TCP_SKB_CB(skb)->seq) {
			head->end_seq = TCP_SKB_CB(skb)->end_seq;
			tcp_sack_maybe_coalesce(tp, head);
			return;
		}

		/* Re-ordered arrival directly below the head block. */
		if (head->start_seq == TCP_SKB_CB(skb)->end_seq) {
			head->start_seq = TCP_SKB_CB(skb)->seq;
			tcp_sack_maybe_coalesce(tp, head);
			return;
		}

		/* Look for any other block SKB is adjacent to; if one
		 * is found, grow it and bring it to the front.
		 */
		for (i = 1; i < nsacks; i++) {
			struct tcp_sack_block *cand = &tp->selective_acks[i];

			if (cand->end_seq == TCP_SKB_CB(skb)->seq)
				cand->end_seq = TCP_SKB_CB(skb)->end_seq;
			else if (cand->start_seq == TCP_SKB_CB(skb)->end_seq)
				cand->start_seq = TCP_SKB_CB(skb)->seq;
			else
				continue;

			tcp_sack_swap(head, cand);
			tcp_sack_maybe_coalesce(tp, head);
			return;
		}

		/* No adjacent block.  Make room at the front by moving
		 * everyone down one slot; if the array is full the last
		 * block is forgotten.
		 */
		limit = tp->tstamp_ok ? 3 : 4;
		if (nsacks >= limit) {
			nsacks--;
			tp->num_sacks--;
		}
		for (i = nsacks; i >= 1; i--) {
			tp->selective_acks[i].start_seq =
				tp->selective_acks[i - 1].start_seq;
			tp->selective_acks[i].end_seq =
				tp->selective_acks[i - 1].end_seq;
		}
	}

	/* Build the new head SACK, and we're done. */
	head->start_seq = TCP_SKB_CB(skb)->seq;
	head->end_seq = TCP_SKB_CB(skb)->end_seq;
	tp->num_sacks++;
}

/* SKB is being delivered in order (either a fresh in-sequence segment
 * or one moved over from the out-of-order queue), so it can only eat
 * from the front of a SACK block.  Trim the block whose start lies
 * within SKB, and delete the block altogether if nothing remains.
 */
static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
{
	struct tcp_sack_block *blk = &tp->selective_acks[0];
	int total = tp->num_sacks;
	int idx = 0;

	/* Skip blocks whose start is not covered by SKB, i.e. it lies
	 * before SKB's seq or at/after SKB's end_seq.
	 */
	while (idx < total &&
	       (before(blk->start_seq, TCP_SKB_CB(skb)->seq) ||
		!before(blk->start_seq, TCP_SKB_CB(skb)->end_seq))) {
		idx++;
		blk++;
	}

	/* No match.  This should only happen if so many SACKs got built
	 * that some were pushed out before we got here, or we are eating
	 * in-sequence packets which precede the first SACK block.
	 */
	if (idx >= total)
		return;

	blk->start_seq = TCP_SKB_CB(skb)->end_seq;
	if (before(blk->start_seq, blk->end_seq))
		return;

	/* Block is now empty: zap it by moving the later SACKs forward. */
	for (idx++; idx < total; idx++, blk++) {
		blk[0].start_seq = blk[1].start_seq;
		blk[0].end_seq = blk[1].end_seq;
	}
	tp->num_sacks--;
}

/* NEW_SKB replaces OLD_SKB in the out-of-order queue.  Find the first
 * SACK block that ends exactly where OLD_SKB ended and move its end to
 * where NEW_SKB ends.  Does nothing if no block matches.
 */
static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
{
	int total = tp->num_sacks;
	int idx;

	for (idx = 0; idx < total; idx++) {
		struct tcp_sack_block *blk = &tp->selective_acks[idx];

		if (blk->end_seq == TCP_SKB_CB(old_skb)->end_seq) {
			blk->end_seq = TCP_SKB_CB(new_skb)->end_seq;
			return;
		}
	}
}

/* This one checks to see if we can put data from the
 * out_of_order queue into the receive_queue.
 */
static void tcp_ofo_queue(struct sock *sk)
{
	struct sk_buff *skb;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	while ((skb = skb_peek(&tp->out_of_order_queue))) {
		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
			break;

		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
			SOCK_DEBUG(sk, "ofo packet was already received \n");
			__skb_unlink(skb, skb->list);
			kfree_skb(skb);
			continue;
		}
		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
			   TCP_SKB_CB(skb)->end_seq);

		if(tp->sack_ok)
			tcp_sack_remove_skb(tp, skb);
		__skb_unlink(skb, skb->list);
		__skb_queue_tail(&sk->receive_queue, skb);
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		if(skb->h.th->fin)
			tcp_fin(skb, sk, skb->h.th);
	}
}

/* Queue an incoming data segment.
 *
 * SKB is classified against tp->rcv_nxt and the receive window:
 *  - starts at rcv_nxt, or straddles it: appended to sk->receive_queue,
 *    rcv_nxt advanced, FIN processed if set, then the out_of_order
 *    queue is drained;
 *  - ends at or before rcv_nxt, or is not acceptable in the current
 *    window: freed, with quick-ACK mode entered and delayed_acks bumped;
 *  - starts after rcv_nxt but inside the window: inserted into
 *    tp->out_of_order_queue, updating the SACK blocks when tp->sack_ok.
 *
 * On every path SKB is either linked into a queue or freed here, and
 * nothing is reported back to the caller.
 */
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
	struct sk_buff *skb1;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/*  Queue data for delivery to the user.
	 *  Packets in sequence go to the receive queue.
	 *  Out of sequence packets to the out_of_order_queue.
	 */
	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
		/* Ok. In sequence.  (Also entered via goto for a partial
		 * packet that straddles rcv_nxt.)
		 */
	queue_and_out:
		if (tcp_receive_window(tp) == 0)
			goto out_of_window;

		/* We know it is in window now too. */

		dst_confirm(sk->dst_cache);
		__skb_queue_tail(&sk->receive_queue, skb);
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		if(skb->h.th->fin) {
			tcp_fin(skb, sk, skb->h.th);
		} else {
			tcp_remember_ack(tp, skb->h.th, skb); 
		}
		/* This may have eaten into a SACK block. */
		if(tp->sack_ok && tp->num_sacks)
			tcp_sack_remove_skb(tp, skb);
		/* The hole may be closed now; pull in whatever follows. */
		tcp_ofo_queue(sk);

		/* Turn on fast path, but only once no out-of-order data
		 * is left.  The predicted word is built from the header
		 * length, 0x10 in the flag byte and the current snd_wnd.
		 */
		if (skb_queue_len(&tp->out_of_order_queue) == 0)
			tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
					       (0x10 << 16) |
					       tp->snd_wnd);
		return;
	}
	
	/* An old packet, either a retransmit or some packet got lost. */
	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
		/* A retransmit, 2nd most common case.  Force an immediate ack. */
		SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
out_of_window:
		/* Shared drop path: request a prompt ACK and free SKB. */
		tcp_enter_quickack_mode(tp);
		tp->delayed_acks++;
		kfree_skb(skb);
		return;
	}

	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
		/* Partial packet, seq < rcv_next < end_seq */
		SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
			   TCP_SKB_CB(skb)->end_seq);

		goto queue_and_out;
	}

	/* Out of window. F.e. zero window probe.
	 *
	 * Note: it is highly possible that we may open window and enqueue
	 * this segment now. However, this will be known only after we queue
	 * it, which will result in queue full of successive 1 byte BSD
	 * window probes, it is SWS in fact. So, always reject it and send ACK.
	 */
	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt+tcp_receive_window(tp)))
		goto out_of_window;

	/* Ok. This is an out_of_order segment, force an ack. */
	tp->delayed_acks++;
	tcp_enter_quickack_mode(tp);

	/* Disable header prediction. */
	tp->pred_flags = 0;

	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

	if (skb_peek(&tp->out_of_order_queue) == NULL) {
		/* Initial out of order segment, build 1 SACK. */
		if(tp->sack_ok) {
			tp->num_sacks = 1;
			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
			tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
		}
		__skb_queue_head(&tp->out_of_order_queue,skb);
	} else {
		/* Walk the queue backwards from its tail looking for the
		 * place where SKB belongs by starting sequence number.
		 */
		for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
			/* Already there. */
			if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
				if (skb->len >= skb1->len) {
					/* Same start, at least as long:
					 * the new SKB replaces the old one.
					 */
					if(tp->sack_ok)
						tcp_sack_extend(tp, skb1, skb);
					__skb_append(skb1, skb);
					__skb_unlink(skb1, skb1->list);
					kfree_skb(skb1);
				} else {
					/* A duplicate, smaller than what is in the
					 * out-of-order queue right now, toss it.
					 */
					kfree_skb(skb);
				}
				break;
			}
			
			/* SKB starts after SKB1: it goes right behind it. */
			if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
				__skb_append(skb1, skb);
				if(tp->sack_ok)
					tcp_sack_new_ofo_skb(sk, skb);
				break;
			}

			/* See if we've hit the start. If so insert. */
			if (skb1 == skb_peek(&tp->out_of_order_queue)) {
				__skb_queue_head(&tp->out_of_order_queue,skb);
				if(tp->sack_ok)
					tcp_sack_new_ofo_skb(sk, skb);
				break;
			}
		}
	}
}


/*
 *	This routine handles the data of a segment.  The TCP header is
 *	stripped from SKB and the payload trimmed to LEN minus the header
 *	length, then the segment is handed to tcp_data_queue().
 *
 *	Returns 1 when SKB was passed on to tcp_data_queue(), which then
 *	owns it.  Returns 0 when SKB carries no sequence space or there
 *	is no room for it; the caller must then dispose of SKB.
 */

static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct tcphdr *th = skb->h.th;

	skb_pull(skb, th->doff*4);
	skb_trim(skb, len - (th->doff*4));

	/* Nothing that occupies sequence space: nothing to queue. */
	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
		return 0;

	/*
	 *	If our receive queue has grown past its limits shrink it.
	 *	This has to happen before rcv_nxt is moved, otherwise we
	 *	might ack data that we don't have enough room for.
	 *
	 *	Pruning can still leave us without room; that can happen
	 *	when skb->true_size differs significantly from skb->len.
	 */
	if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf &&
	    prune_queue(sk) < 0)
		return 0;

	tcp_data_queue(sk, skb);

	if (before(tp->rcv_nxt, tp->copied_seq)) {
		printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
		tp->rcv_nxt = tp->copied_seq;
	}

	/* tcp_data_queue() has already bumped delayed_acks as needed.
	 * Now tell the user we may have some data.
	 */
	if (!sk->dead)
		sk->data_ready(sk,0);

	return 1;
}

/* SKB is the head of the not-yet-sent data.  Transmit if it fits in the
 * peer's window and the congestion window has room; otherwise, if
 * nothing is in flight and no timer is pending, start window probing.
 */
static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int may_send;

	may_send = !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
		   tcp_packets_in_flight(tp) < tp->snd_cwnd;

	if (may_send) {
		/* Put more data onto the wire. */
		tcp_write_xmit(sk);
		return;
	}

	/* Start probing the receiver's window. */
	if (tp->packets_out == 0 && !tp->pending)
		tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
}

/* Run the send check only when there is unsent data (send_head set). */
static __inline__ void tcp_data_snd_check(struct sock *sk)
{
	struct sk_buff *unsent = sk->tp_pinfo.af_tcp.send_head;

	if (unsent == NULL)
		return;

	__tcp_data_snd_check(sk, unsent);
}

/*
 * Adapt the MSS value used to make delayed ack decisions to the
 * real world.
 *
 * rcv_mss is raised whenever a longer segment is seen.  In addition,
 * two consecutive segments of the same length (at least 536 bytes)
 * set rcv_mss to that length, which is the only way it can shrink.
 */
static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int seg = skb->len;
	unsigned int prev_seg = tp->last_seg_size;

	if (seg > tp->rcv_mss)
		tp->rcv_mss = seg;

	/* Short segments are not remembered for the next comparison. */
	if (seg < 536) {
		tp->last_seg_size = 0;
		return;
	}

	if (seg == prev_seg)
		tp->rcv_mss = seg;
	tp->last_seg_size = seg;
}

/*
 * Decide between sending an ACK right now and scheduling a delayed one.
 *
 * This also takes care of updating the window.
 *
 * Rules for delaying an ack:
 *	- delay time <= 0.5 HZ
 *	- we don't have a window update to send
 *	- must send at least every 2 full sized packets
 *	- must send an ACK if we have any out of order data
 *
 * With an extra heuristic to handle loss of packet situations and
 * also helping the sender leave slow start in an expedient manner.
 */
static __inline__ void __tcp_ack_snd_check(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int ack_now;

	/* The checks are tried in this order and stop at the first hit. */
	if ((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) {
		/* Two full frames received. */
		ack_now = 1;
	} else if (tcp_raise_window(sk)) {
		/* We will update the window "significantly". */
		ack_now = 1;
	} else if (tcp_in_quickack_mode(tp)) {
		/* We entered "quick ACK" mode. */
		ack_now = 1;
	} else if (skb_peek(&tp->out_of_order_queue) != NULL) {
		/* We have out of order data. */
		ack_now = 1;
	} else {
		ack_now = 0;
	}

	if (ack_now)
		tcp_send_ack(sk);
	else
		tcp_send_delayed_ack(tp, HZ/2);
}

/* Run the ACK decision only if an ACK is still owed.  delayed_acks
 * being zero means we sent a data segment already.
 */
static __inline__ void tcp_ack_snd_check(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	if (tp->delayed_acks != 0)
		__tcp_ack_snd_check(sk);
}


/*
 *	This routine is only called when we have urgent data
 *	signalled.  It's the 'slow' part of tcp_urg.  It could be
 *	moved inline now as tcp_urg is only called from one
 *	place.  We handle URGent data wrong.  We have to - as
 *	BSD still doesn't use the correction from RFC961.
 *	For 1003.1g we should support a new option TCP_STDURG to permit
 *	either form (or just set the sysctl tcp_stdurg).
 */

static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	u32 urg_off = ntohs(th->urg_ptr);
	u32 urg_at;

	/* Unless tcp_stdurg is set, a non-zero pointer is taken to be
	 * one past the urgent byte.
	 */
	if (urg_off != 0 && !sysctl_tcp_stdurg)
		urg_off--;
	urg_at = urg_off + ntohl(th->seq);

	/* Ignore urgent data that we've already seen and read, and
	 * anything that lies before rcv_nxt.
	 */
	if (after(tp->copied_seq, urg_at) || before(urg_at, tp->rcv_nxt))
		return;

	/* Do we already have a newer (or duplicate) urgent pointer? */
	if (tp->urg_data && !after(urg_at, tp->urg_seq))
		return;

	/* Tell the world about our new urgent pointer: positive proc
	 * signals a process, negative proc a process group.
	 */
	if (sk->proc > 0)
		kill_proc(sk->proc, SIGURG, 1);
	else if (sk->proc < 0)
		kill_pg(-sk->proc, SIGURG, 1);

	/* We may be adding urgent data when the last byte read was
	 * urgent.  To do this requires some care.  We cannot just ignore
	 * tp->copied_seq since we would read the last urgent byte again
	 * as data, nor can we alter copied_seq until this data arrives
	 * or we break the semantics of SIOCATMARK (and thus sockatmark()).
	 */
	if (tp->urg_seq == tp->copied_seq &&
	    tp->urg_data &&
	    !sk->urginline &&
	    tp->copied_seq != tp->rcv_nxt)
		tp->copied_seq++;

	tp->urg_data = URG_NOTYET;
	tp->urg_seq = urg_at;

	/* Disable header prediction. */
	tp->pred_flags = 0;
}

/* This is the 'fast' part of urgent handling: pick up a new urgent
 * pointer if the segment carries one, and latch the urgent byte once
 * the segment that contains it arrives.
 */
static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	u32 off;

	/* Check if we get a new urgent pointer - normally not. */
	if (th->urg)
		tcp_check_urg(sk,th);

	/* Do we wait for any urgent data? - normally not... */
	if (tp->urg_data != URG_NOTYET)
		return;

	/* Offset of the urgent byte, counted from the start of the
	 * TCP header.
	 */
	off = tp->urg_seq - ntohl(th->seq) + (th->doff*4) - th->syn;

	/* Is the urgent pointer pointing into this packet? */
	if (off >= len)
		return;

	tp->urg_data = URG_VALID | *(off + (unsigned char *) th);
	if (!sk->dead)
		sk->data_ready(sk,0);
}

/* Clean the out_of_order queue if we can, trying to get
 * the socket within its memory limits again.
 *
 * Returns 0 if receive data may still be accepted, or -1 (massive
 * buffer overcommit: rmem_alloc is at least twice rcvbuf) if the
 * caller should start dropping frames until the socket owning
 * process reads some of the data to stabilize the situation.
 */
static int prune_queue(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct sk_buff *victim;
	int purged = 0;

	SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);

	net_statistics.PruneCalled++;

	/* First, purge the whole out_of_order queue, tail first. */
	while ((victim = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) {
		net_statistics.OfoPruned += victim->len;
		kfree_skb(victim);
		purged = 1;
	}

	/* If anything was dropped, reset SACK state.  A conforming SACK
	 * implementation will do the same at a timeout based retransmit.
	 * When a connection is in a sad state like this, we care only
	 * about integrity of the connection, not performance.
	 */
	if (purged && tp->sack_ok)
		tp->num_sacks = 0;

	/* If we are really being abused, tell the caller to silently
	 * drop receive data on the floor.  It will get retransmitted
	 * and hopefully then we'll have sufficient space.
	 *
	 * We used to try to purge the in-order packets too, but that
	 * turns out to be deadly and fraught with races.  Consider:
	 *
	 * 1) If we acked the data, we absolutely cannot drop the
	 *    packet.  This data would then never be retransmitted.
	 * 2) It is possible, with a proper sequence of events involving
	 *    delayed acks and backlog queue handling, to have the user
	 *    read the data before it gets acked.  The previous code
	 *    here got this wrong, and it led to data corruption.
	 * 3) Too many state changes happen when the FIN arrives, so once
	 *    we've seen that we can't remove any in-order data safely.
	 *
	 * The net result is that removing in-order receive data is too
	 * complex for anyone's sanity.  So we don't do it anymore.  But
	 * if we are really having our buffer space abused we stop
	 * accepting new receive data.
	 */
	return (atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1)) ? 0 : -1;
}

/*
 *	TCP receive function for the ESTABLISHED state. 
 *
 *	It is split into a fast path and a slow path. The fast path is 
 * 	disabled when:
 *	- A zero window was announced from us - zero window probing
 *        is only handled properly in the slow path. 
 *      - Out of order segments arrived.
 *	- Urgent data is expected.
 *	- There is no buffer space left
 *	- Unexpected TCP flags/window values/header lengths are received
 *	  (detected by checking the TCP header against pred_flags) 
 *	- Data is sent in both directions. Fast path only supports pure senders
 *	  or pure receivers (this means either the sequence number or the ack
 *	  value must stay constant)
 *
 *	When these conditions are not satisfied it drops into a standard 
 *	receive procedure patterned after RFC793 to handle all cases.
 *	The first three cases are guaranteed by proper pred_flags setting,
 *	the rest is checked inline. Fast processing is turned on in 
 *	tcp_data_queue when everything is OK.
 *
 *	SKB is consumed on every path that returns 0: it is either queued
 *	or freed here.  The only path returning 1 is a SYN whose sequence
 *	number differs from tp->syn_seq; tcp_reset() has been called by
 *	then and SKB has not been freed by this function.
 */
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
			struct tcphdr *th, unsigned len)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int queued;
	u32 flg;

	/*
	 *	Header prediction.
	 *	The code follows the one in the famous 
	 *	"30 instruction TCP receive" Van Jacobson mail.
	 *	
	 *	Van's trick is to deposit buffers into socket queue 
	 *	on a device interrupt, to call tcp_recv function
	 *	on the receive process context and checksum and copy
	 *	the buffer to user space. smart...
	 *
	 *	Our current scheme is not silly either but we take the 
	 *	extra cost of the net_bh soft interrupt processing...
	 *	We do checksum and copy also but from device to kernel.
	 */

	/*
	 * RFC1323: H1. Apply PAWS check first.
	 *
	 * A segment failing PAWS is counted as an input error and, unless
	 * it is a RST, answered with an ACK and dropped.  A RST that fails
	 * PAWS is not dropped here; it falls through to the ts_recent
	 * update and the rest of the processing.
	 */
	if (tcp_fast_parse_options(sk, th, tp)) {
		if (tp->saw_tstamp) {
			if (tcp_paws_discard(tp, th, len)) {
				tcp_statistics.TcpInErrs++;
				if (!th->rst) {
					tcp_send_ack(sk);
					goto discard;
				}
			}
			tcp_replace_ts_recent(sk, tp,
					      TCP_SKB_CB(skb)->seq,
					      TCP_SKB_CB(skb)->end_seq);
		}
	}

	/* Fourth 32-bit word of the TCP header, with the bits that header
	 * prediction ignores masked off.
	 */
	flg = *(((u32 *)th) + 3) & ~htonl(0xFC8 << 16);

	/*	pred_flags is 0xS?10 << 16 + snd_wnd
	 *	if header prediction is to be made
	 *	'S' will always be tp->tcp_header_len >> 2
	 *	'?' will be 0 else it will be !0
	 *	(when there are holes in the receive 
	 *	 space for instance)
	 *	PSH flag is ignored.
         */

	if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
		if (len <= th->doff*4) {
			/* Bulk data transfer: sender.  The segment has no
			 * payload, so only the ACK needs processing.
			 */
			if (len == th->doff*4) {
				tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
					TCP_SKB_CB(skb)->ack_seq, len); 
				kfree_skb(skb); 
				tcp_data_snd_check(sk);
				return 0;
			} else { /* Header too small */
				tcp_statistics.TcpInErrs++;
				goto discard;
			}
		} else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
			   atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
			/* Bulk data transfer: receiver.  The ACK field
			 * acknowledges nothing new and we are within our
			 * receive buffer limit, so the payload can be
			 * queued directly.
			 */
			__skb_pull(skb,th->doff*4);

			tcp_measure_rcv_mss(sk, skb); 

			/* DO NOT notify forward progress here.
			 * It saves dozen of CPU instructions in fast path. --ANK
			 */
			__skb_queue_tail(&sk->receive_queue, skb);
			tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;

			/* FIN bit check is not done since if FIN is set in
			 * this frame, the pred_flags won't match up. -DaveM
			 */
			sk->data_ready(sk, 0);
			tcp_delack_estimator(tp);

			tcp_remember_ack(tp, th, skb); 

			__tcp_ack_snd_check(sk);
			return 0;
		}
		/* Prediction matched but neither fast case applies:
		 * fall through to the slow path.
		 */
	}

	/*
	 *	Standard slow path.
	 */

	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
		/* RFC793, page 37: "In all states except SYN-SENT, all reset
		 * (RST) segments are validated by checking their SEQ-fields."
		 * And page 69: "If an incoming segment is not acceptable,
		 * an acknowledgment should be sent in reply (unless the RST bit
		 * is set, if so drop the segment and return)".
		 */
		if (th->rst)
			goto discard;
		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
			SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
				   TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   tp->rcv_wup, tp->rcv_wnd);
		}
		tcp_send_ack(sk);
		goto discard;
	}

	/* A SYN whose sequence number is not our recorded syn_seq is an
	 * error: reset the connection.  Note SKB is not freed on this path.
	 */
	if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
		SOCK_DEBUG(sk, "syn in established state\n");
		tcp_statistics.TcpInErrs++;
		tcp_reset(sk);
		return 1;
	}
	
	if(th->rst) {
		tcp_reset(sk);
		goto discard;
	}

	if(th->ack)
		tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
	
	/* Process urgent data. */
	tcp_urg(sk, th, len);

	/* step 7: process the segment text.  A non-zero result means
	 * tcp_data() handed SKB to tcp_data_queue(), so it must not be
	 * freed below.
	 */
	queued = tcp_data(skb, sk, len);

	/* This must be after tcp_data() does the skb_pull() to
	 * remove the header size from skb->len.
	 *
	 * Dave!!! Phrase above (and all about rcv_mss) has 
	 * nothing to do with reality. rcv_mss must measure TOTAL
	 * size, including sacks, IP options etc. Hence, measure_rcv_mss
	 * must occur before pulling etc, otherwise it will flap
	 * like hell. Even putting it before tcp_data is wrong,
	 * it should use skb->tail - skb->nh.raw instead.
	 *					--ANK (980805)
	 * 
	 * BTW I broke it. Now all TCP options are handled equally
	 * in mss_clamp calculations (i.e. ignored, rfc1122),
	 * and mss_cache does include all of them (i.e. tstamps)
	 * except for sacks, to calculate effective mss faster.
	 * 					--ANK (980805)
	 *
	 * NOTE(review): tcp_data_queue() calls kfree_skb() on SKB on its
	 * out-of-window and smaller-duplicate paths while tcp_data() still
	 * returns 1, and tcp_measure_rcv_mss() reads skb->len.  Confirm
	 * that SKB cannot already have been released at this point.
	 */
	tcp_measure_rcv_mss(sk, skb); 

	/* Be careful, tcp_data() may have put this into TIME_WAIT. */
	if(sk->state != TCP_CLOSE) {
		tcp_data_snd_check(sk);
		tcp_ack_snd_check(sk);
	}

	if (!queued) {
	discard:
		kfree_skb(skb);
	}

	return 0;
}

/*
 *	Process an incoming segment for a SYN_RECV connection that is
 *	still represented as an open_request on listening socket SK.
 *
 *	Returns the socket that should go on processing SKB: SK itself
 *	for a new SYN, or the child socket of REQ (created here if it
 *	does not exist yet), in which case SKB is re-owned by the child.
 *	Returns NULL when the segment has been dealt with here (SYN-ACK
 *	retransmitted, reset sent) or no child could be created.
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, 
			   struct open_request *req)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sock *child;
	u32 flags;

	/*	assumption: the socket is not in use.
	 *	as we checked the user count on tcp_rcv and we're
	 *	running from a soft interrupt.
	 */

	/* Pick the flag bits selected by 0x17 out of the fourth header
	 * word and see whether SYN is the only one of them set.
	 */
	flags = *(((u32 *)skb->h.th) + 3) & __constant_htonl(0x00170000);
	if (flags == __constant_htonl(0x00020000)) {
		/* A SYN with a different ISN is a new connection
		 * attempt: pass it to the listen socket.
		 */
		if (TCP_SKB_CB(skb)->seq != req->rcv_isn)
			return sk;

		/* Retransmitted SYN: answer with our SYN-ACK again. */
		req->class->rtx_syn_ack(sk, req);
		return NULL;
	}

	/* Anything other than a bare SYN is handed to the child socket.
	 * It may exist already (created but not yet accepted()).
	 */
	child = req->sk;
	if (child == NULL) {
		/* In theory the packet could be for a cookie, but
		 * TIME_WAIT should guard us against this.
		 * XXX: Nevertheless check for cookies?
		 * This sequence number check is done again later,
		 * but we do it here to prevent syn flood attackers
		 * from creating big SYN_RECV sockets.
		 */
		if (!between(TCP_SKB_CB(skb)->ack_seq,
			     req->snt_isn, req->snt_isn+1) ||
		    !between(TCP_SKB_CB(skb)->seq,
			     req->rcv_isn, req->rcv_isn+1+req->rcv_wnd)) {
			req->class->send_reset(skb);
			return NULL;
		}

		child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
		if (child == NULL)
			return NULL;

		/* The request now has a socket: drop it from SYN-ACK
		 * timer accounting and clear its expiry.
		 */
		tcp_dec_slow_timer(TCP_SLT_SYNACK);
		req->expires = 0UL;
		req->sk = child;
	}

	/* Charge SKB to the child instead of its previous owner. */
	skb_orphan(skb);
	skb_set_owner_r(skb, child);
	return child;
}

/*
 *	This function implements the receiving procedure of RFC 793 for
 *	all states except ESTABLISHED and TIME_WAIT.
 *	It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
 *	address independent.
 *
 *	sk:  socket the segment belongs to
 *	skb: the received segment
 *	th:  TCP header of the segment
 *	len: length value handed on unchanged to tcp_ack(),
 *	     tcp_paws_discard(), tcp_urg() and tcp_data()
 *
 *	Returns 0 when the segment was dealt with here: skb was either
 *	kept (tcp_data() returned nonzero) or freed at the "discard" label.
 *	Returns 1 on the error paths; skb is NOT freed on those paths.
 *	NOTE(review): going by the TCP_CLOSE comment below, a return of 1
 *	is expected to make the caller send a RST -- confirm against callers.
 */
	
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
			  struct tcphdr *th, unsigned len)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int queued = 0;	/* stays 0 unless tcp_data() reports it kept skb */

	/* CLOSE, LISTEN and SYN_SENT are handled completely inside this
	 * switch (every path returns or jumps to discard), except for
	 * one "break" in the SYN_SENT case.  All other states fall out
	 * of the switch into the generic RFC 793 steps below.
	 */
	switch (sk->state) {
	case TCP_CLOSE:
		/* When state == CLOSED, hash lookup always fails.
		 *
		 * But, there is a back door, the backlog queue.
		 * If we have a sequence of packets in the backlog
		 * during __release_sock() which have a sequence such
		 * that:
		 *	packet X	causes entry to TCP_CLOSE state
		 *	...
		 *	packet X + N	has FIN bit set
		 *
		 * We report a (luckily) harmless error in this case.
		 * The issue is that backlog queue processing bypasses
		 * any hash lookups (we know which socket packets are for).
		 * The correct behavior here is what 2.0.x did, since
		 * a TCP_CLOSE socket does not exist.  Drop the frame
		 * and send a RST back to the other end.
		 */
		return 1;

	case TCP_LISTEN:
		/* These use the socket TOS.. 
		 * might want to be the received TOS 
		 */
		/* An ACK arriving on a listening socket is an error. */
		if(th->ack)
			return 1;
		
		if(th->syn) {
			if(tp->af_specific->conn_request(sk, skb, 0) < 0)
				return 1;

			/* Now we have several options: In theory there is 
			 * nothing else in the frame. KA9Q has an option to 
			 * send data with the syn, BSD accepts data with the
			 * syn up to the [to be] advertised window and 
			 * Solaris 2.1 gives you a protocol error. For now 
			 * we just ignore it, that fits the spec precisely 
			 * and avoids incompatibilities. It would be nice in
			 * future to drop through and process the data.
			 *
			 * Now that TTCP is starting to be used we ought to 
			 * queue this data.
			 * But, this leaves one open to an easy denial of
			 * service attack, and SYN cookies can't defend
			 * against this problem. So, we drop the data
			 * in the interest of security over speed.
			 */
			goto discard;
		}
		
		/* Neither ACK nor SYN: silently drop the segment. */
		goto discard;
		break;	/* not reached */

	case TCP_SYN_SENT:
		/* SYN sent means we have to look for a suitable ack and 
		 * either reset for bad matches or go to connected. 
		 * The SYN_SENT case is unusual and should
		 * not be in line code. [AC]
		 */
		if(th->ack) {
			/* rfc793:
			 * "If the state is SYN-SENT then
			 *    first check the ACK bit
			 *      If the ACK bit is set
			 *	  If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
			 *        a reset (unless the RST bit is set, if so drop
			 *        the segment and return)"
			 *
			 *  I cite this place to emphasize one essential
			 *  detail: this check is different from the one
			 *  in established state: SND.UNA <= SEG.ACK <= SND.NXT.
			 *  SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT,
			 *  because we have no previous data sent before SYN.
			 *                                        --ANK(990513)
			 *
			 *  We do not send data with SYN, so that RFC-correct
			 *  test reduces to:
			 */
			if (sk->zapped ||
			    TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
				return 1;

			/* Now ACK is acceptable.
			 *
			 * "If the RST bit is set
			 *    If the ACK was acceptable then signal the user "error:
			 *    connection reset", drop the segment, enter CLOSED state,
			 *    delete TCB, and return."
			 */

			if (th->rst) {
				tcp_reset(sk);
				goto discard;
			}

			/* rfc793:
			 *   "fifth, if neither of the SYN or RST bits is set then
			 *    drop the segment and return."
			 *
			 *    See note below!
			 *                                        --ANK(990513)
			 */
			
			if (!th->syn)
				goto discard;

			/* rfc793:
			 *   "If the SYN bit is on ...
			 *    are acceptable then ...
			 *    (our SYN has been ACKed), change the connection
			 *    state to ESTABLISHED..."
			 *
			 * Do you see? SYN-less ACKs in SYN-SENT state are
			 * completely ignored.
			 *
			 * The bug causing stalled SYN-SENT sockets
			 * was here: tcp_ack advanced snd_una and canceled
			 * retransmit timer, so that bare ACK received
			 * in SYN-SENT state (even with invalid ack==ISS,
			 * because tcp_ack check is too weak for SYN-SENT)
			 * causes moving socket to invalid semi-SYN-SENT,
			 * semi-ESTABLISHED state and connection hangs.
			 *
			 * There exist buggy stacks, which really send
			 * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp)
			 * Actually, if this host did not try to get something
			 * from ftp.inr.ac.ru I'd never find this bug 8)
			 *
			 *                                     --ANK (990514)
			 */

			/* snd_wl1 is set before tcp_ack() runs and then set
			 * to the same value again further down.
			 * NOTE(review): presumably tcp_ack() reads snd_wl1
			 * for its window update test -- confirm before
			 * removing either assignment.
			 */
			tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
			tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
				TCP_SKB_CB(skb)->ack_seq, len);

			/* Ok.. it's good. Set up sequence numbers and
			 * move to established.
			 */
			tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
			tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;

			/* RFC1323: The window in SYN & SYN/ACK segments is
			 * never scaled.
			 *
			 * NOTE(review): th->window is a header field, so
			 * ntohs() would be the conventional spelling here;
			 * htons() performs the same byte swap.
			 */
			tp->snd_wnd = htons(th->window);
			tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
			tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
			tp->syn_seq = TCP_SKB_CB(skb)->seq;
			tp->fin_seq = TCP_SKB_CB(skb)->seq;

			tcp_set_state(sk, TCP_ESTABLISHED);
			tcp_parse_options(sk, th, tp, 0);

			/* Window scaling was not negotiated: no shift in
			 * either direction and clamp the window to 16 bits.
			 */
        		if (tp->wscale_ok == 0) {
                		tp->snd_wscale = tp->rcv_wscale = 0;
                		tp->window_clamp = min(tp->window_clamp,65535);
        		}

			/* Header length depends on whether timestamps
			 * were negotiated.
			 */
			if (tp->tstamp_ok) {
				tp->tcp_header_len =
					sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
			} else
				tp->tcp_header_len = sizeof(struct tcphdr);
			if (tp->saw_tstamp) {
				tp->ts_recent = tp->rcv_tsval;
				tp->ts_recent_stamp = tcp_time_stamp;
			}

			/* Can't be earlier, doff would be wrong. */
			tcp_send_ack(sk);

			sk->dport = th->source;
			tp->copied_seq = tp->rcv_nxt;

			/* Wake up whoever is waiting on the connect. */
			if(!sk->dead) {
				sk->state_change(sk);
				sock_wake_async(sk->socket, 0);
			}
		} else {
			/* No ACK bit: a bare SYN moves the socket
			 * from SYN_SENT to SYN_RECV.
			 */
			if(th->syn && !th->rst) {
				/* The previous version of the code
				 * checked for "connecting to self"
				 * here. that check is done now in
				 * tcp_connect.
				 */
				tcp_set_state(sk, TCP_SYN_RECV);
				tcp_parse_options(sk, th, tp, 0);
				if (tp->saw_tstamp) {
					tp->ts_recent = tp->rcv_tsval;
					tp->ts_recent_stamp = tcp_time_stamp;
				}
				
				tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
				tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

				/* RFC1323: The window in SYN & SYN/ACK segments is
				 * never scaled.
				 */
				tp->snd_wnd = htons(th->window);
				tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
				
				tcp_send_synack(sk);
			} else
				/* Leaves the switch: the segment goes on
				 * through the generic steps below.
				 */
				break; 
		}

		/* tp->tcp_header_len and tp->mss_clamp
		   probably changed, synchronize mss.
		   */
		tcp_sync_mss(sk, tp->pmtu_cookie);
		tp->rcv_mss = tp->mss_cache;

		/* Discard data/urg received with SYN. Safety first. */
		goto discard;
	}

	/*   Parse the tcp_options present on this header.
	 *   By this point we really only expect timestamps.
	 *   Note that this really has to be here and not later for PAWS
	 *   (RFC1323) to work.
	 */
	if (tcp_fast_parse_options(sk, th, tp)) {
		/* NOTE: assumes saw_tstamp is never set if we didn't
		 * negotiate the option. tcp_fast_parse_options() must
		 * guarantee this.
		 */
		if (tp->saw_tstamp) {
			if (tcp_paws_discard(tp, th, len)) {
				tcp_statistics.TcpInErrs++;
				/* A segment with RST set is not dropped
				 * here; it continues to the checks below.
				 */
				if (!th->rst) {
					tcp_send_ack(sk);
					goto discard;
				}
			}
			tcp_replace_ts_recent(sk, tp,
					      TCP_SKB_CB(skb)->seq,
					      TCP_SKB_CB(skb)->end_seq);
		}
	}

	/* The silly FIN test here is necessary to see an advancing ACK in
	 * retransmitted FIN frames properly.  Consider the following sequence:
	 *
	 *	host1 --> host2		FIN XSEQ:XSEQ(0) ack YSEQ
	 *	host2 --> host1		FIN YSEQ:YSEQ(0) ack XSEQ
	 *	host1 --> host2		XSEQ:XSEQ(0) ack YSEQ+1
	 *	host2 --> host1		FIN YSEQ:YSEQ(0) ack XSEQ+1	(fails tcp_sequence test)
	 *
	 * At this point the connection will deadlock with host1 believing
	 * that its FIN is never ACK'd, and thus it will retransmit its FIN
	 * forever.  The following fix is from Taral (taral@taral.net).
	 */

	/* step 1: check sequence number */
	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq) &&
	    !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)) {
		/* Unacceptable segment: answer with an ACK unless it
		 * carries RST, then drop it.
		 */
		if (!th->rst) {
			tcp_send_ack(sk);
		}
		goto discard;
	}

	/* step 2: check RST bit */
	if(th->rst) {
		tcp_reset(sk);
		goto discard;
	}

	/* step 3: check security and precedence [ignored] */

	/*	step 4:
	 *
	 *	Check for a SYN, and ensure it matches the SYN we were
	 *	first sent. We have to handle the rather unusual (but valid)
	 *	sequence that KA9Q derived products may generate of
	 *
	 *	SYN
	 *				SYN|ACK Data
	 *	ACK	(lost)
	 *				SYN|ACK Data + More Data
	 *	.. we must ACK not RST...
	 *
	 *	We keep syn_seq as the sequence space occupied by the 
	 *	original syn. 
	 */

	if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
		tcp_reset(sk);
		return 1;
	}

	/* step 5: check the ACK field */
	if (th->ack) {
		int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
					 TCP_SKB_CB(skb)->ack_seq, len);
		
		/* "acceptable" is only consulted in SYN_RECV; the other
		 * states test whether everything written so far has been
		 * acknowledged (snd_una == write_seq).
		 */
		switch(sk->state) {
		case TCP_SYN_RECV:
			if (acceptable) {
				tcp_set_state(sk, TCP_ESTABLISHED);
				sk->dport = th->source;
				tp->copied_seq = tp->rcv_nxt;

				if(!sk->dead)
					sk->state_change(sk);		

				/* Unlike the SYN/SYN-ACK case above, the
				 * advertised window is shifted by snd_wscale.
				 */
				tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
				tp->snd_wnd = htons(th->window) << tp->snd_wscale;
				tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
				tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;

			} else {
				SOCK_DEBUG(sk, "bad ack\n");
				return 1;
			}
			break;

		case TCP_FIN_WAIT1:
			if (tp->snd_una == tp->write_seq) {
				sk->shutdown |= SEND_SHUTDOWN;
				tcp_set_state(sk, TCP_FIN_WAIT2);
				/* A dead socket gets the TIME_CLOSE timer
				 * armed with the fin timeout instead of a
				 * state_change notification.
				 */
				if (!sk->dead)
					sk->state_change(sk);
				else
					tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
			}
			break;

		case TCP_CLOSING:	
			if (tp->snd_una == tp->write_seq) {
				tcp_time_wait(sk);
				goto discard;
			}
			break;

		case TCP_LAST_ACK:
			if (tp->snd_una == tp->write_seq) {
				sk->shutdown = SHUTDOWN_MASK;
				tcp_set_state(sk,TCP_CLOSE);
				if (!sk->dead)
					sk->state_change(sk);
				goto discard;
			}
			break;
		}
	} else
		/* Segments without ACK are dropped at this point. */
		goto discard;

	/* step 6: check the URG bit */
	tcp_urg(sk, th, len);

	/* step 7: process the segment text */
	switch (sk->state) {
	case TCP_CLOSE_WAIT:
	case TCP_CLOSING:
		/* Segments starting at or after fin_seq are not
		 * processed as data in these states.
		 */
		if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
			break;
		/* fall through */
	
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
		/* RFC 793 says to queue data in these states,
		 * RFC 1122 says we MUST send a reset. 
		 * BSD 4.4 also does reset.
		 */
		/* Only when receive is shut down and the socket is dead:
		 * a non-empty segment whose data (end_seq minus the FIN
		 * flag) reaches beyond rcv_nxt resets the connection.
		 */
		if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
			if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
			    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
				tcp_reset(sk);
				return 1;
			}
		}
		/* fall through */
		
	case TCP_ESTABLISHED: 
		queued = tcp_data(skb, sk, len);

		/* This must be after tcp_data() does the skb_pull() to
		 * remove the header size from skb->len.
		 */
		tcp_measure_rcv_mss(sk, skb); 
		break;
	}

	tcp_data_snd_check(sk);
	tcp_ack_snd_check(sk);

	/* The "discard" label sits inside the if-body so that every
	 * "goto discard" above frees skb, while the normal path frees
	 * it only when tcp_data() did not keep it.
	 */
	if (!queued) { 
discard:
		kfree_skb(skb);
	}
	return 0;
}
Elixir Cross Referencer

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.