tcp_input.c - net/ipv4/tcp_input.c - Linux source code 2.0.37pre8

/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	@(#)tcp_input.c	1.0.16	05/25/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * FIXES
 *		Pedro Roque	:	Double ACK bug
 *		Eric Schenk	:	Fixes to slow start algorithm.
 *		Eric Schenk	:	Yet another double ACK bug.
 *		Eric Schenk	:	Delayed ACK bug fixes.
 *		Eric Schenk	:	Floyd style fast retrans war avoidance.
 *		Eric Schenk	: 	Skip fast retransmit on small windows.
 *		Eric Schenk	:	Fixes to retransmission code to
 *				:	avoid extra retransmission.
 *		Theodore Ts'o	:	Do secure TCP sequence numbers.
 *		Eric Schenk	:	SYN and RST cookies for dealing
 *				:	with SYN flooding attacks.
 *		David S. Miller	:	New socket lookup architecture for ISS.
 *					This code is dedicated to John Dyson.
 *              Elliot Poger    :       Added support for SO_BINDTODEVICE.
 *	Willy Konynenberg	:	Transparent proxy adapted to new
 *					socket hash code.
 *	J Hadi Salim		:	We assumed that some idiot wasnt going
 *	Alan Cox			to idly redefine bits of ToS in an
 *					experimental protocol for other things
 *					(ECN) - wrong!. Mask the bits off. Note
 *					masking the bits if they dont use ECN
 *					then use it for ToS is even more 
 *					broken. 
 *					</RANT>
 *	George Baeslack		:	SIGIO delivery on accept() bug that
 *					affected sun jdk.
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/random.h>
#include <net/tcp.h>

/*
 *	Do we assume the IP ToS is entirely for its intended purpose
 */
 
#define TOS_VALID_MASK(x)		((x)&0x3F)
 
/*
 *	Policy code extracted so it's now separate
 */

/*
 *	Called each time to estimate the delayed ack timeout. This is
 *	how it should be done so a fast link isn't impacted by ack delay.
 */
 
extern __inline__ void tcp_delack_estimator(struct sock *sk)
{
	/*
	 *	Delayed ACK time estimator.
	 */
	
	if (sk->lrcvtime == 0) 
	{
		sk->lrcvtime = jiffies;
		sk->ato = HZ/3;
	}
	else 
	{
		int m;
		
		m = jiffies - sk->lrcvtime;

		sk->lrcvtime = jiffies;

		if (m <= 0)
			m = 1;

		/* This used to test against sk->rtt.
		 * On a purely receiving link, there is no rtt measure.
		 * The result is that we lose delayed ACKs on one-way links.
		 * Therefore we test against sk->rto, which will always
		 * at least have a default value.
		 */
		if (m > sk->rto)
		{
			sk->ato = sk->rto;
			/*
			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
			 */
		}
		else 
		{
			/*
		 	 * Very fast acting estimator.
		 	 * May fluctuate too much. Probably we should be
			 * doing something like the rtt estimator here.
			 */
			sk->ato = (sk->ato >> 1) + m;
			/*
			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
			 */
		}
	}
}

/*
 *	Called on frames that were known _not_ to have been
 *	retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. 
 *	The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
 */
 
extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
{
	long m;
	/*
	 *	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88.  Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible 
	 *	m stands for "measurement".
	 */
	
	m = jiffies - oskb->when;  /* RTT */

	if (sk->rtt != 0) {
		if(m<=0)
			m=1;		/* IS THIS RIGHT FOR <0 ??? */
		m -= (sk->rtt >> 3);    /* m is now error in rtt est */
		sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
		if (m < 0)
			m = -m;		/* m is now abs(error) */
		m -= (sk->mdev >> 2);   /* similar update on mdev */
		sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */
	} else {
		/* no previous measure. */
		sk->rtt = m<<3;		/* take the measured time to be rtt */
		sk->mdev = m<<1;	/* make sure rto = 3*rtt */
	}

	/*
	 *	Now update timeout.  Note that this removes any backoff.
	 */
			 
	/* Jacobson's algorithm calls for rto = R + 4V.
	 * We diverge from Jacobson's algorithm here. See the commentary
	 * in tcp_ack to understand why.
	 */
	sk->rto = (sk->rtt >> 3) + sk->mdev;
	sk->rto += (sk->rto>>2) + (sk->rto >> (sk->cong_window-1));
	if (sk->rto > 120*HZ)
		sk->rto = 120*HZ;
	if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
		sk->rto = HZ/5;
	sk->backoff = 0;
}

#if defined(CONFIG_RST_COOKIES)

/*
 * This code needs to be a bit more clever.
 * Does 300 second timeouts now. Still just a circular buffer.
 * At most 32 validations stored. New validations are ignored
 * if all 32 validations are currently valid. To do otherwise
 * allows a situation in which clearances are forgotten before
 * they can be used (provided valid traffic is coming fast enough).
 * The buffer should really be as long as the number of valid
 * connections we want to accept in an 300 second period.
 * 32 is maybe to small. On the other hand, the validation check
 * algorithm has to walk the whole table, which is also stupid.
 * It would be better to have a combined hash/circular buffer.
 * The hash could be used with chaining for fast lookup.
 * Really this is probably an argument against using RST cookies
 * at all, since they take up space for the clearances.
 */

static struct {
	u32 saddr;
	unsigned long tstamp;
} clearances[32] = {
{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}};

static next_clearance = 0;
/* Does the address saddr have an active security clearance? */
int tcp_clearance(__u32 saddr)
{
	int i;
	for (i = 0; i < 32; i++)
		if (clearances[i].saddr == saddr
		&& clearances[i].tstamp > jiffies-HZ*300)
			return 1;
	return 0;
}

void add_clearance(__u32 saddr)
{
	/*
	 * If expired then we can add a new entry.
	 */
	if (clearances[next_clearance].tstamp <= jiffies-HZ*300) {
		clearances[next_clearance].saddr = saddr;
		clearances[next_clearance].tstamp = jiffies;
		next_clearance = (next_clearance+1)%32;
	}
}

#endif

#ifdef CONFIG_SYN_COOKIES
/*
 *	MTU values we can represent in fall back mode.
 *	These values are partially borrowed from Jeff Weisberg's SunOS
 *	implementation of SYNCOOKIES. I have added an extra limiting
 * 	value of 64 to deal with the case of very small MTU values.
 *	(e.g. long delay packet radio links, 1200 baud modems.)
 */
static __u32 cookie_mtu[8] = { 64, 256, 512, 536, 1024, 1440, 1460, 4312 };
unsigned int ui_c_send_cookies = 0;
#endif

extern void tcp_v4_hash(struct sock *sk);
extern void tcp_v4_unhash(struct sock *sk);
extern void tcp_v4_rehash(struct sock *sk);

/* Don't inline this cruft.  Here are some nice properties to
 * exploit here.  The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection.  So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *tcp_v4_lookup_longway(u32 daddr, unsigned short hnum,
					  struct device *dev)
{
	struct sock *sk = tcp_listening_hash[tcp_lhashfn(hnum)];
	struct sock *result = NULL;
	int score, hiscore = 0;

	for(; sk; sk = sk->next) {
		if(sk->num == hnum) {
			__u32 rcv_saddr = sk->rcv_saddr;
			score = 1;

			/* If this socket is bound to a particular IP address,
			 * does the dest IPaddr of the packet match it?
			 */
			if(rcv_saddr) {
				if(rcv_saddr != daddr)
					continue;
				score++;
			}

			/* If this socket is bound to a particular interface,
			 * did the packet come in on it? */
			if (sk->bound_device) {
				if (dev != sk->bound_device)
					continue;
				score++;
			}

			/* Check the score--max is 3. */
			if (score == 3)
				return sk; /* Best possible match. */
			if (score > hiscore) {
				hiscore = score;
				result = sk;
			}
		}
	}
	return result;
}

/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 */
static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
					   u32 saddr, u16 sport, u32 daddr, 
					   u16 dport, struct device *dev)
{
	unsigned short hnum = ntohs(dport);
	struct sock *sk;

	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.  It is assumed that this code only
	 * gets called from within NET_BH.
	 */
	sk = tcp_established_hash[tcp_hashfn(daddr, hnum, saddr, sport)];
	for(; sk; sk = sk->next)
		if(sk->daddr		== saddr		&& /* remote address */
		   sk->dummy_th.dest	== sport		&& /* remote port    */
		   sk->num		== hnum			&& /* local port     */
		   sk->rcv_saddr	== daddr		&& /* local address  */
		   ((sk->bound_device==NULL) || (sk->bound_device==dev))  )
			goto hit; /* You sunk my battleship! */
	sk = tcp_v4_lookup_longway(daddr, hnum, dev);
hit:
	return sk;
}
 
__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport,
				      struct device *dev)
{
	return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dev);
}

#ifdef CONFIG_IP_TRANSPARENT_PROXY
/* I am not entirely sure this is fully equivalent to the old lookup code, but it does
 * look reasonable.  WFK
 */
struct sock *tcp_v4_proxy_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, u32 paddr, u16 rport,
				 struct device *dev)
{
	unsigned short hnum = ntohs(dport);
	unsigned short hrnum = ntohs(rport);
	struct sock *sk;

	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.  It is assumed that this code only
	 * gets called from within NET_BH.
	 */
	sk = tcp_established_hash[tcp_hashfn(daddr, hnum, saddr, sport)];
	for(; sk; sk = sk->next)
		if(sk->daddr		== saddr		&& /* remote address */
		   sk->dummy_th.dest	== sport		&& /* remote port    */
		   sk->num		== hnum			&& /* local port     */
		   sk->rcv_saddr	== daddr		&& /* local address  */
		   ((sk->bound_device==NULL) || (sk->bound_device==dev))  )
			goto hit; /* You sunk my battleship! */
	/* If we don't match on a bound socket, try to find one explicitly listening
	 * on the remote address (a proxy bind).
	 */
	sk = tcp_v4_lookup_longway(daddr, hnum, dev);
	/* If that didn't yield an exact match, look for a socket listening on the
	 * redirect port.
	 */
	if (!sk || sk->rcv_saddr != daddr) {
		sk = tcp_v4_lookup_longway(paddr, hrnum, dev);
	}
hit:
	return sk;
}
#endif

/*
 * React to a out-of-window TCP sequence number in an incoming packet
 */
 
static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, u32 end_seq,
	      struct device *dev)
{
	if (th->rst)
		return;

	/*
	 *	Send a reset if we get something not ours and we are
	 *	unsynchronized. Note: We don't do anything to our end. We
	 *	are just killing the bogus remote connection then we will
	 *	connect again and it will work (with luck).
	 */
  	 
	if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
	{
		tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev,0,255);
		return;
	}

	/*
	 * 	This packet is old news. Usually this is just a resend
	 * 	from the far end, but sometimes it means the far end lost
	 *	an ACK we sent, so we better send an ACK.
	 */
	/*
	 *	BEWARE! Unconditional answering by ack to out-of-window ack
	 *	can result in infinite exchange of empty acks.
	 *	This check cures bug, found by Michiel Boland, but
	 *	not another possible cases.
	 *	If we are in TCP_TIME_WAIT, we have already received
	 *	FIN, so that our peer need not window update. If our
	 *	ACK were lost, peer would retransmit his FIN anyway. --ANK
	 */
	if (sk->state != TCP_TIME_WAIT || ntohl(th->seq) != end_seq)
		tcp_send_ack(sk);
}

/*
 *	This functions checks to see if the tcp header is actually acceptable. 
 */
 
extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
{
	u32 end_window = sk->lastwin_seq + sk->window;
	return	/* if start is at end of window, end must be too (zero window) */
		(seq == end_window && seq == end_seq) ||
		/* if start is before end of window, check for interest */
		(before(seq, end_window) && !before(end_seq, sk->acked_seq));
}

/*
 *	When we get a reset we do this. This probably is a tcp_output routine
 *	really.
 */

static int tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;
	/*
	 *	We want the right error as BSD sees it (and indeed as we do).
	 */
	switch (sk->state) {
	case TCP_TIME_WAIT:
		break;
	case TCP_SYN_SENT:
		sk->err = ECONNREFUSED;
		break;
	case TCP_CLOSE_WAIT:
		sk->err = EPIPE;
		break;
	default:
		sk->err = ECONNRESET;
	}
#ifdef CONFIG_TCP_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 *
	 *	This is a good idea, but causes more sockets to take time to close.
	 *
	 *	Ian Heavens has since shown this is an inadequate fix for the protocol
	 *	bug in question.
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{	
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else	
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif	
	if (!sk->dead) 
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	return(0);
}


/*
 *	Look for tcp options. Parses everything but only knows about MSS.
 *	This routine is always called with the packet containing the SYN.
 *	However it may also be called with the ack to the SYN.  So you
 *	can't assume this is always the SYN.  It's always called after
 *	we have set up sk->mtu to our own MTU.
 *
 *	We need at minimum to add PAWS support here. Possibly large windows
 *	as Linux gets deployed on 100Mb/sec networks.
 */
 
static void tcp_options(struct sock *sk, struct tcphdr *th)
{
	unsigned char *ptr;
	int length=(th->doff*4)-sizeof(struct tcphdr);
	int mss_seen = 0;
    
	ptr = (unsigned char *)(th + 1);
  
	while(length>0)
	{
	  	int opcode=*ptr++;
	  	int opsize=*ptr++;
	  	switch(opcode)
	  	{
	  		case TCPOPT_EOL:
	  			goto ende;
	  		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
	  			length--;
	  			ptr--;		/* the opsize=*ptr++ above was a mistake */
	  			continue;
	  		
	  		default:
	  			if(opsize<=2)	/* Avoid silly options looping forever */
	  				goto ende;
	  			switch(opcode)
	  			{
	  				case TCPOPT_MSS:
	  					if(opsize==4 && th->syn)
	  					{
	  						sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
							mss_seen = 1;
	  					}
	  					break;
		  				/* Add other options here as people feel the urge to implement stuff like large windows */
	  			}
	  			ptr+=opsize-2;
	  			length-=opsize;
	  	}
	}
ende:	if (th->syn) 
	{
		if (! mss_seen)
		      sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
	}
#ifdef CONFIG_INET_PCTCP
	sk->mss = min(sk->max_window >> 1, sk->mtu);
#else    
	sk->mss = min(sk->max_window, sk->mtu);
	sk->max_unacked = 2 * sk->mss;
#endif  
}


/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 */
 
static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
{
	struct sock *newsk;
	struct tcphdr *th;
	struct rtable *rt;
#ifdef CONFIG_SYN_COOKIES
	int send_cookie = 0;
#endif

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead) 
	{
		/*
		 * This must wait for 3 way completion.
  		 * sk->data_ready(sk,0);
  		 */
	}
	else 
	{
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, 0,255);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 *	Make sure we can accept more.  This will prevent a
	 *	flurry of syns from eating up all our memory.
	 *
	 *	BSD does some funnies here and allows 3/2 times the
	 *	set backlog as a fudge factor. That's just too gross.
	 *
         *	Well, now I'm making things even grosser for dealing
	 *	with SYNACK flooding.
	 */

	if (sk->ack_backlog >= sk->max_ack_backlog)
	{
#if defined(CONFIG_RST_COOKIES) || defined(CONFIG_SYN_COOKIES)
		static unsigned long warning_time = 0;

		/* We may be experiencing SYNACK flooding.
		 * We now must decide if we should accept this connection.
		 * If we have a security clearance for the incoming
		 * packet, i.e. it is from a location we where talking
		 * to succesfully recently, or that has responded to
		 * a security probe, then we go ahead and deal normally,
		 * accepting up to 2*max in the backlog.
		 * Otherwise, we send out either an RST security probe
		 * or a SYN cookie, or both. (depending on configuration).
		 * Note that we send out a cookie even if the backlog
		 * is full up to 2*max, since the backlog may clear
		 * by the time we get a response.
		 * WARNING: This code changes the semantics of the backlog
		 * a bit. I'm not entirely sure this is the right thing
		 * to do here.
	 	 */
		extern void tcp_send_synack_probe(unsigned long saddr,
						  unsigned long daddr, struct tcphdr *th,
						  struct proto *prot,
						  struct options *opt,
						  struct device *dev, int tos, int ttl);

#ifdef CONFIG_RST_COOKIES
		if (!tcp_clearance(saddr)) {
#endif
			/* Only let this warning get printed once a minute. */
			if (jiffies - warning_time > HZ*60) {
				warning_time = jiffies;
				printk(KERN_INFO "Warning: possible SYN flood from %d.%d.%d.%d on %d.%d.%d.%d:%d.  Sending cookies.\n", 
					NIPQUAD(saddr), NIPQUAD(daddr), ntohs(th->dest));
			}
#ifdef CONFIG_RST_COOKIES
			tcp_send_synack_probe(daddr, saddr, th, &tcp_prot,
				opt, dev, skb->ip_hdr->tos, 255);
#endif
#ifdef CONFIG_SYN_COOKIES
			send_cookie = 1;
			ui_c_send_cookies++;
#else
			/* If we only have RST cookies we should
			 * not drop through to the rest of the response code.
			 */
			kfree_skb(skb, FREE_READ);
			return;
#endif
#ifdef CONFIG_RST_COOKIES
		} else if (sk->ack_backlog >= 2*sk->max_ack_backlog) {
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
#endif
#else
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
#endif
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL) 
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	memcpy(newsk, sk, sizeof(*newsk));

	/* Or else we die! -DaveM */
	newsk->sklist_next = NULL;
	/* and die again -- erics */
	newsk->pprev = NULL;

	newsk->opt = NULL;
	newsk->ip_route_cache  = NULL;
	if (opt && opt->optlen) 
	{
		sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
		if (!sk->opt) 
		{
	        	kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
		if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) 
		{
			kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
	        	kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			kfree_skb(skb, FREE_READ);
			return;
		}
	}
	
	skb->when = jiffies;	/* For timeout */
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	newsk->send_next = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = TCP_TIMEOUT_INIT;
	newsk->max_window = 32;	/* It cannot be left at zero. -DaveM */
	/*
	 * See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	newsk->cong_window = 1;
	newsk->cong_count = 0;
	newsk->ssthresh = 0x7fffffff;

	newsk->lrcvtime = 0;
	newsk->idletime = 0;
	newsk->high_seq = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->seq+1;
	newsk->lastwin_seq = skb->seq+1;
	newsk->delay_acks = 1;
	newsk->copied_seq = skb->seq+1;
	newsk->fin_seq = skb->seq;
	newsk->syn_seq = skb->seq;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->delack_timer);
	newsk->delack_timer.data = (unsigned long)newsk;
	newsk->delack_timer.function = tcp_delack_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function = tcp_retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;
	newsk->users=0;
	
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	/* 
	 *	Deal with possibly redirected traffic by setting num to
	 *	the intended destination port of the received packet.
	 */
	newsk->num = ntohs(skb->h.th->dest);

#endif
	/*
	 *	Swap these two, they are from our point of view. 
	 */
	 
	newsk->daddr = saddr;
	newsk->saddr = daddr;
	newsk->rcv_saddr = daddr;
#ifdef CONFIG_SYN_COOKIES
	/* Don't actually stuff the socket into the protocol lists
	 * if we are going to just destroy it anyway. We don't want any
	 * funnies happening if the next packet arrives before we get
	 * a chance to clean this one up.
	 */
	if (!send_cookie)
#endif
	{
		tcp_v4_hash(newsk);
		add_to_prot_sklist(newsk);
	}

	newsk->acked_seq = skb->seq + 1;
	newsk->copied_seq = skb->seq + 1;
	newsk->socket = NULL;
	newsk->listening = sk;

	/*
	 *	Grab the ttl and tos values and use them 
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=TOS_VALID_MASK(skb->ip_hdr->tos);

	/*
	 *	Use 512 or whatever user asked for 
	 */

	/*
	 * 	Note use of sk->user_mss, since user has no direct access to newsk 
	 */

	rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0,
			 sk->bound_device);
	newsk->ip_route_cache = rt;
	
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;
		
	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if (rt)
		newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	else 
		newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);

	/*
	 *	But not bigger than device MTU 
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));

	/* Must check it here, just to be absolutely safe.  If we end up
	 * with a newsk->{max_window,mtu} of zero, we can thus end up with
	 * a newsk->mss of zero, which causes us to bomb out in
	 * tcp_do_sendmsg. -DaveM
	 */
	if(newsk->mtu < 32)
		newsk->mtu = 32;

#ifdef CONFIG_SKIP
	
	/*
	 *	SKIP devices set their MTU to 65535. This is so they can take packets
	 *	unfragmented to security process then fragment. They could lie to the
	 *	TCP layer about a suitable MTU, but it's easier to let skip sort it out
	 *	simply because the final package we want unfragmented is going to be
	 *
	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
	 */
	 
	if(skip_pick_mtu!=NULL)		/* If SKIP is loaded.. */
		sk->mtu=skip_pick_mtu(sk->mtu,dev);
#endif
	/*
	 *	This will min with what arrived in the packet 
	 */

	tcp_options(newsk,skb->h.th);

#ifdef CONFIG_SYN_COOKIES
	if (send_cookie) {
		int mtu_index = 0;
		/* Pick the largest MTU smaller than sk->mtu that we
		 * can represent in a cookies bottom 3 bits.
		 */
		while (newsk->mtu > cookie_mtu[mtu_index+1] && mtu_index < 7)
			mtu_index++;
		newsk->mtu = cookie_mtu[mtu_index];
		/*
		 * Choose a cookie.
		 */
		seq = secure_tcp_syn_cookie(daddr,saddr,
        	        ntohs(th->source),ntohs(th->dest),ntohl(th->seq),jiffies/(60*HZ));
		seq |= mtu_index;
	}
#endif

	/* Set up the right sequence numbers */
	newsk->write_seq = seq; 
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;

#ifdef CONFIG_SYN_COOKIES
	tcp_send_synack(newsk, sk, skb, send_cookie);
#else
	tcp_send_synack(newsk, sk, skb, 0);
#endif
}


#ifdef CONFIG_SYN_COOKIES
/*
 *	This routine handles a faked connection request as a result
 *	of a valid SYN cookie being seen. This sets up a socket in the
 *	SYN_SENT state.
 */
 
static int tcp_conn_request_fake(struct sock *sk, struct sk_buff *skb,
		 u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq, u32 mtu)
{
	struct sock *newsk;
	struct sk_buff *newskb;
	struct rtable *rt;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead) 
	{
  		/*sk->data_ready(sk,0); */
	}
	else 
	{
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_statistics.TcpAttemptFails++;
		return 0;
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */

	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL) 
	{
		/* Bad juju. If we ignore things now the remote side
		 * will be frozen. Really we should retrans the cookie,
		 * but that's a no go also, since we don't have enough
		 * memory to receive it either. So, we're stuck with
		 * this bad case, and a few others further down.
		 * We just have to hope it is a low probability event.
		 * Also, to avoid a loop we must not go down into
		 * the recursive call to tcp_rcv in the caller to this
		 * routine, so we should let them know we failed.
		 */
		tcp_statistics.TcpAttemptFails++;
		return 0;
	}

	memcpy(newsk, sk, sizeof(*newsk));

	/* Or else we die! -DaveM */
	newsk->sklist_next = NULL;

	newsk->opt = NULL;
	newsk->ip_route_cache  = NULL;
	if (opt && opt->optlen) 
	{
		sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
		if (!sk->opt) 
		{
			/* More bad juju. */
	        	kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			return 0;
		}
		if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) 
		{
			/* More bad juju. */
			kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
	        	kfree_s(newsk, sizeof(struct sock));
			tcp_statistics.TcpAttemptFails++;
			return 0;
		}
	}
	
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	newsk->send_next = NULL;
	skb_queue_head_init(&newsk->back_log);
	newsk->rtt = 0;
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = TCP_TIMEOUT_INIT;
	newsk->max_window = 32; /* It cannot be left at zero. -DaveM */
	/*
	 * See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	newsk->cong_window = 1;
	newsk->cong_count = 0;
	newsk->ssthresh = 0x7fffffff;

	newsk->lrcvtime = 0;
	newsk->idletime = 0;
	newsk->high_seq = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->seq;
	newsk->lastwin_seq = skb->seq;
	newsk->delay_acks = 1;
	newsk->copied_seq = skb->seq;
	newsk->fin_seq = skb->seq-1;
	newsk->syn_seq = skb->seq-1;
	newsk->state = TCP_SYN_RECV;
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->delack_timer);
	newsk->delack_timer.data = (unsigned long)newsk;
	newsk->delack_timer.function = tcp_delack_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function = tcp_retransmit_timer;
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;
	newsk->users=0;
	
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	/* 
	 *	Deal with possibly redirected traffic by setting num to
	 *	the intended destination port of the received packet.
	 */
	newsk->num = ntohs(skb->h.th->dest);

#endif
	/*
	 *	Swap these two, they are from our point of view. 
	 */
	 
	newsk->daddr = saddr;
	newsk->saddr = daddr;
	newsk->rcv_saddr = daddr;
	tcp_v4_hash(newsk);
	add_to_prot_sklist(newsk);

	newsk->acked_seq = skb->seq;
	newsk->copied_seq = skb->seq;
	newsk->socket = NULL;
	newsk->listening = sk;

	/*
	 *	Grab the ttl and tos values and use them 
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=TOS_VALID_MASK(skb->ip_hdr->tos);

	rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0,
			 sk->bound_device);
	newsk->ip_route_cache = rt;
	
	if (rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;

	newsk->mtu = mtu;

	/* Set up the right sequence numbers.
	 * Note that we have to make sure write_seq is correct for having
 	 * sent off the handshake!
	 */
	newsk->write_seq = seq+1; 
	newsk->sent_seq = seq+1;
	newsk->window_seq = seq;
	newsk->rcv_ack_seq = seq;
	newsk->max_unacked = 2 * newsk->mss;

	tcp_select_window(newsk);

	/* We need to get something into the receive queue to enable an
	 * accept. Possibly we should be faking up a SYN packet, but
	 * as far as I can tell the contents of this skb don't matter,
	 * so long as it points to our new socket.
	 */
	newskb = skb_clone(skb,GFP_ATOMIC);
	newskb->sk = newsk;
	atomic_add(skb->truesize, &newsk->rmem_alloc);
	sk->ack_backlog++;
	skb_queue_tail(&sk->receive_queue,newskb);
	return 1;
}
#endif

/*
 * Handle a TCP window that shrunk on us. It shouldn't happen,
 * but..
 *
 * We may need to move packets from the send queue
 * to the write queue, if the window has been shrunk on us.
 * The RFC says you are not allowed to shrink your window
 * like this, but if the other end does, you must be able
 * to deal with it.
 */
void tcp_window_shrunk(struct sock * sk, u32 window_seq)
{
	struct sk_buff *skb;
	struct sk_buff *skb2;
	struct sk_buff *wskb = NULL;
 	
	skb2 = sk->send_head;
	sk->send_head = NULL;
	sk->send_tail = NULL;
	sk->send_next = NULL;

	/*
	 *	This is an artifact of a flawed concept. We want one
	 *	queue and a smarter send routine when we send all.
	 */
	cli();
	while (skb2 != NULL) 
	{
		skb = skb2;
		skb2 = skb->link3;
		skb->link3 = NULL;
		if (after(skb->end_seq, window_seq)) 
		{
			if (sk->packets_out > 0) 
				sk->packets_out--;
			/* We may need to remove this from the dev send list. */
			if (skb->next != NULL) 
			{
				skb_unlink(skb);				
			}
			/* Now add it to the write_queue. */
			if (wskb == NULL)
				skb_queue_head(&sk->write_queue,skb);
			else
				skb_append(wskb,skb);
			wskb = skb;
		} 
		else 
		{
			if (sk->send_head == NULL) 
			{
				sk->send_head = skb;
				sk->send_tail = skb;
				sk->send_next = skb;
			}
			else
			{
				sk->send_tail->link3 = skb;
				sk->send_tail = skb;
			}
			skb->link3 = NULL;
		}
	}
	sti();
}


/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	This routine is totally _WRONG_. The list structuring is wrong,
 *	the algorithm is wrong, the code is wrong.
 */

static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
{
	int flag = 0;
	u32 window_seq;

	/* 
	 * 1 - there was data in packet as well as ack or new data is sent or 
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if(sk->zapped)
		return(1);	/* Dead, can't ack any more so why bother */

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */
	 
	if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
	  	sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */
	 
	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
		goto uninteresting_ack;

	/*
	 *	Have we discovered a larger window
	 */
	window_seq = ntohs(th->window);
	if (window_seq > sk->max_window) 
	{
  		sk->max_window = window_seq;
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(window_seq>>1, sk->mtu);
#else
		sk->mss = min(window_seq, sk->mtu);
#endif	
	}
	window_seq += ack;

	/*
	 *	See if our window has been shrunk. 
	 */
	if (after(sk->window_seq, window_seq))
		tcp_window_shrunk(sk, window_seq);

	/*
	 *	Pipe has emptied
	 */	 
	if (sk->send_tail == NULL || sk->send_head == NULL) 
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->send_next = NULL;
		sk->packets_out= 0;
	}

	/*
	 *	We don't want too many packets out there. 
	 */
	 
	if (sk->ip_xmit_timeout == TIME_WRITE && 
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
	{
		
		/* 
		 * This is Jacobson's slow start and congestion avoidance. 
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window <= sk->ssthresh)
			/* 
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else 
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *  	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window) 
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else 
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received and update the
	 *	right hand window edge of the host.
	 *	We do a bit of work here to track number of times we've
	 *	seen this ack without a change in the right edge of the
	 *	window and no data in the packet.
	 *	This will allow us to do fast retransmits.
	 */

	/* We are looking for duplicate ACKs here.
	 * An ACK is a duplicate if:
	 * (1) it has the same sequence number as the largest number we've seen,
	 * (2) it has the same window as the last ACK,
	 * (3) we have outstanding data that has not been ACKed
	 * (4) The packet was not carrying any data.
	 * (5) [From Floyd's paper on fast retransmit wars]
	 *     The packet acked data after high_seq;
	 * I've tried to order these in occurrence of most likely to fail
	 * to least likely to fail.
	 * [These are an extension of the rules BSD stacks use to
	 *  determine if an ACK is a duplicate.]
	 */

	if (sk->rcv_ack_seq == ack
		&& sk->window_seq == window_seq
		&& len == th->doff*4
		&& before(ack, sk->sent_seq)
		&& after(ack, sk->high_seq))
	{
		/* Prevent counting of duplicate ACKs if the congestion
		 * window is smaller than 3. Note that since we reduce
		 * the congestion window when we do a fast retransmit,
		 * we must be careful to keep counting if we were already
		 * counting. The idea behind this is to avoid doing
		 * fast retransmits if the congestion window is so small
		 * that we cannot get 3 ACKs due to the loss of a packet
		 * unless we are getting ACKs for retransmitted packets.
		 */
		if (sk->cong_window >= 3 || sk->rcv_ack_cnt > MAX_DUP_ACKS+1)
			sk->rcv_ack_cnt++;
		/* See draft-stevens-tcpca-spec-01 for explanation
		 * of what we are doing here.
		 */
		if (sk->rcv_ack_cnt == MAX_DUP_ACKS+1) {
			int tmp;

			/* We need to be a bit careful to preserve the
			 * count of packets that are out in the system here.
			 */
			sk->ssthresh = max(
				min(sk->cong_window,
				(sk->window_seq-sk->rcv_ack_seq)/max(sk->mss,1))
				 >> 1, 2);
			sk->cong_window = sk->ssthresh+MAX_DUP_ACKS+1;
			sk->cong_count = 0;
			tmp = sk->packets_out;
			tcp_do_retransmit(sk,0);
			sk->packets_out = tmp;
		} else if (sk->rcv_ack_cnt > MAX_DUP_ACKS+1) {
			sk->cong_window++;
			/*
			* At this point we are suppose to transmit a NEW
			* packet (not retransmit the missing packet,
			* this would only get us into a retransmit war.)
			* I think that having just adjusted cong_window
			* we will transmit the new packet below.
			*/
		}
	}
	else
	{
		if (sk->rcv_ack_cnt > MAX_DUP_ACKS) {
			/* Don't allow congestion window to drop to zero. */
			sk->cong_window = max(sk->ssthresh, 1);
			sk->cong_count = 0;
		}
		sk->window_seq = window_seq;
		sk->rcv_ack_seq = ack;
		sk->rcv_ack_cnt = 1;
	}
	
	/*
	 *	We passed data and got it acked, remove any soft error
	 *	log. Something worked...
	 */
	 
	sk->err_soft = 0;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0) 
	{
		sk->retransmits = 0;	/* Our probe was answered */
		
		/*
		 *	Was it a usable window open ?
		 */
		 
  		if (!skb_queue_empty(&sk->write_queue) &&   /* should always be true */
		    ! before (sk->window_seq, sk->write_queue.next->end_seq)) 
		{
			sk->backoff = 0;
			
			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			/*
			 * Appendix C of Van Jacobson's final version of
			 * the SIGCOMM 88 paper states that although
			 * the original paper suggested that
			 *  RTO = R*2V
			 * was the correct calculation experience showed
			 * better results using
			 *  RTO = R*4V
			 * In particular this gives better performance over
			 * slow links, and should not effect fast links.
			 *
			 * Note: Jacobson's algorithm is fine on BSD which
			 * has a 1/2 second granularity clock, but with our
			 * 1/100 second granularity clock we become too
	 		 * sensitive to minor changes in the round trip time.
			 * We add in two compensating factors.
			 * First we multiply by 5/4. For large congestion
			 * windows this allows us to tolerate burst traffic
			 * delaying up to 1/4 of our packets.
			 * We also add in a rtt / cong_window term.
			 * For small congestion windows this allows
			 * a single packet delay, but has negligible effect
			 * on the compensation for large windows.
	 		 */
			sk->rto = (sk->rtt >> 3) + sk->mdev;
			sk->rto += (sk->rto>>2) + (sk->rto >> (sk->cong_window-1));
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < HZ/5)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
			sk->rto = HZ/5;
		}
	}

	/* 
	 *	See if we can take anything off of the retransmit queue.
	 */

	for (;;) {
		int was_locked;
		struct sk_buff * skb = sk->send_head;
		if (!skb)
			break;

		/* Check for a bug. */
		if (skb->link3 && after(skb->end_seq, skb->link3->end_seq)) 
			printk("INET: tcp.c: *** bug send_list out of order.\n");
			
		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived the other end.
		 */
		 
		if (after(skb->end_seq, ack))
			break;

		if (sk->retransmits) 
		{
			/*
			 *	We were retransmitting.  don't count this in RTT est 
			 */
			flag |= 2;
		}

		if ((sk->send_head = skb->link3) == NULL)
		{
			sk->send_tail = NULL;
			sk->send_next = NULL;
			sk->retransmits = 0;
		}

		/*
		 * advance the send_next pointer if needed.
		 */
		if (sk->send_next == skb)
			sk->send_next = sk->send_head;

		/*
		 * Note that we only reset backoff and rto in the
		 * rtt recomputation code.  And that doesn't happen
		 * if there were retransmissions in effect.  So the
		 * first new packet after the retransmissions is
		 * sent with the backoff still in effect.  Not until
		 * we get an ack from a non-retransmitted packet do
		 * we reset the backoff and rto.  This allows us to deal
		 * with a situation where the network delay has increased
		 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
		 */

		/*
		 *	We have one less packet out there. 
		 */
			 
		if (sk->packets_out > 0) 
			sk->packets_out --;

		/* This is really only supposed to be called when we
		 * are actually ACKing new data, which should exclude
		 * the ACK handshake on an initial SYN packet as well.
		 * Rather than introducing a new test here for this
		 * special case, we just reset the initial values for
		 * rtt immediately after we move to the established state.
		 */
		if (!(flag&2)) 	/* Not retransmitting */
			tcp_rtt_estimator(sk,skb);
		IS_SKB(skb);

		/*
		 *	We may need to remove this from the dev send list. 
		 */
		cli();
		was_locked = skb_device_locked(skb);

		if (was_locked) {
			/* In this case, we are relying on the fact that kfree_skb
			 * will just set the free flag to be 3, and increment
			 * a counter. It will not actually free anything, and 
			 * will not take much time
			 */
			kfree_skb(skb, FREE_WRITE);	
		} else {
			skb_unlink(skb);
		}
		sti();

		if (!was_locked)
		    kfree_skb(skb, FREE_WRITE); /* write. */
		if (!sk->dead)
			sk->write_space(sk);
	}

	/*
	 * Maybe we can take some stuff off of the write queue,
	 * and put it onto the xmit queue.
	 * There is bizarre case being tested here, to check if
	 * the data at the head of the queue ends before the start of
	 * the sequence we already ACKed. This is not an error,
	 * it can occur when we send a packet directly off of the write_queue
	 * in a zero window probe.
	 */

	if (!skb_queue_empty(&sk->write_queue) &&
	    	!before(sk->window_seq, sk->write_queue.next->end_seq) &&
		(sk->retransmits == 0 || 
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq)) &&
		sk->packets_out < sk->cong_window)
	{
		/*
		 *	Add more data to the send queue.
		 */
		tcp_write_xmit(sk);
	}

	/*
	 * Reset timers to reflect the new state.
	 *
	 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
	 * from TCP_CLOSE we don't do anything
	 *
	 * from anything else, if there is queued data (or fin) pending,
	 * we use a TIME_WRITE timeout, if there is data to write but
	 * no room in the window we use TIME_PROBE0, else if keepalive
	 * we reset to a KEEPALIVE timeout, else we delete the timer.
	 *
	 * We do not set flag for nominal write data, otherwise we may
	 * force a state where we start to write itsy bitsy tidbits
	 * of data.
	 */

	switch(sk->state) {
	case TCP_TIME_WAIT:
		/*
		 * keep us in TIME_WAIT until we stop getting packets,
		 * reset the timeout.
		 */
		tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
		break;
	case TCP_CLOSE:
		/*
		 * don't touch the timer.
		 */
		break;
	default:
		/*
		 * 	Must check send_head and write_queue
		 * 	to determine which timeout to use.
		 */
		if (sk->send_head) {
			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		} else if (!skb_queue_empty(&sk->write_queue)
			&& sk->ack_backlog == 0)
		{
			/* 
			 * if the write queue is not empty when we get here
			 * then we failed to move any data to the retransmit
			 * queue above. (If we had send_head would be non-NULL).
			 * Furthermore, since the send_head is NULL here
			 * we must not be in retransmit mode at this point.
			 * This implies we have no packets in flight,
			 * hence sk->packets_out < sk->cong_window.
			 * Examining the conditions for the test to move
			 * data to the retransmission queue we find that
			 * we must therefore have a zero window.
			 * Hence, if the ack_backlog is 0 we should initiate
			 * a zero probe.
			 * We don't do a zero probe if we have a delayed
			 * ACK in hand since the other side may have a
			 * window opening, but they are waiting to hear
			 * from us before they tell us about it.
			 * (They are applying Nagle's rule).
			 * So, we don't set up the zero window probe
			 * just yet. We do have to clear the timer
			 * though in this case...
			 */
			tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
		} else if (sk->keepopen) {
			tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		} else {
			del_timer(&sk->retransmit_timer);
			sk->ip_xmit_timeout = 0;
		}
		break;
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN.  The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK) 
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
		{
			sk->shutdown = SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_CLOSE);
			return 1;
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 *	Move to FIN_WAIT2 to await a FIN from the other end. Set
	 *	SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1) 
	{

		if (!sk->dead) 
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq) 
		{
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
			/* If the socket is dead, then there is no
			 * user process hanging around using it.
			 * We want to set up a FIN_WAIT2 timeout ala BSD.
			 */
			if (sk->dead)
				tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 *	Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING) 
	{

		if (!sk->dead) 
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq) 
		{
			tcp_time_wait(sk);
		}
	}
	
	/*
	 *	Final ack of a three way shake 
	 */
	 
	if (sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		
		/*
		 *	We have a listening socket owning us. Wake it for
		 *	the accept.
		 */
		 
  		if ( sk->listening ) 
  		{
  			/* The listener may be sk->dead. Dont worry 
  			   data_ready traps this */
			sk->data_ready(sk->listening,0);
			sk->listening = NULL;
		}			

		/* Must check for peer advertising zero sized window
		 * or else we get a sk->{mtu,mss} of zero and thus bomb out
		 * in tcp_do_sendmsg. -DaveM
		 */
		if(sk->max_window == 0)
			sk->max_window = 32;

		tcp_options(sk,th);

#if 0
		sk->dummy_th.dest=th->source;
		tcp_v4_rehash(sk);
#endif

		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);

		/* Reset the RTT estimator to the initial
		 * state rather than testing to avoid
		 * updating it on the ACK to the SYN packet.
		 */
		sk->rtt = 0;
		sk->rto = TCP_TIMEOUT_INIT;
		sk->mdev = TCP_TIMEOUT_INIT;
	}
	
	/*
	 * The following code has been greatly simplified from the
	 * old hacked up stuff. The wonders of properly setting the
	 * retransmission timeouts.
	 *
	 * If we are retransmitting, and we acked a packet on the retransmit
	 * queue, and there is still something in the retransmit queue,
	 * then we can output some retransmission packets.
	 *
	 * Note that we need to be a bit careful here about getting the
	 * correct TIME_WRITE timer set. If we just got an ack of a
	 * packet we where retransmitting, we will retransmit the next
	 * packet in the retransmit queue below, and the timeout
	 * should now start from the time we retransmitted that packet.
	 * The resetting of the TIME_WRITE timer above will have set it
	 * relative to the prior transmission time, which would be wrong.
	 */

	if (sk->send_head != NULL && (flag&2) && sk->retransmits)
	{
		tcp_do_retransmit(sk, 1);
		tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}

	return 1;

uninteresting_ack:
	if(sk->debug)
		printk("Ack ignored %u %u\n",ack,sk->sent_seq);
			
	/*
	 *	Keepalive processing.
	 */
		 
	if (after(ack, sk->sent_seq)) 
	{
		return 0;
	}
		
	/*
	 *	Restart the keepalive timer.
	 */
		 
	if (sk->keepopen) 
	{
		if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
			tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
	}
	return 1;
}


/*
 * 	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 *
 */
 
static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	sk->fin_seq = skb->end_seq;

	if (!sk->dead) 
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state) 
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */

			if (sk->ip_xmit_timeout != TIME_WRITE) {
				if (sk->send_head)
					tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
				else if (sk->ip_xmit_timeout != TIME_PROBE0
				|| skb_queue_empty(&sk->write_queue)) {
					/* BUG check case.
					 * We have a problem here if there
					 * is no timer running [leads to
					 * frozen socket] or no data in the
					 * write queue [means we sent a fin
					 * and lost it from the queue before
					 * changing the ack properly].
					 */
					printk(KERN_ERR "Lost timer or fin packet in tcp_fin.\n");
				}
			}
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			tcp_set_state(sk,TCP_LAST_ACK);
	
			/* Start the timers. */
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}

/*
 * Add a sk_buff to the TCP receive queue, calculating
 * the ACK sequence as we go..
 */
static inline void tcp_insert_skb(struct sk_buff * skb, struct sk_buff_head * list)
{
	struct sk_buff * prev, * next;
	u32 seq;

	/*
	 * Find where the new skb goes.. (This goes backwards,
	 * on the assumption that we get the packets in order)
	 */
	seq = skb->seq;
	prev = list->prev;
	next = (struct sk_buff *) list;
	for (;;) {
		if (prev == (struct sk_buff *) list || !after(prev->seq, seq))
			break;
		next = prev;
		prev = prev->prev;
	}
	__skb_insert(skb, prev, next, list);
}

/*
 * Called for each packet when we find a new ACK endpoint sequence in it
 */
static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
{
	/*
	 *	When we ack the fin, we do the FIN 
	 *	processing.
	 */
	skb->acked = 1;
	if (skb->h.th->fin)
		tcp_fin(skb,sk,skb->h.th);
	return skb->end_seq;
}	

static void tcp_queue(struct sk_buff * skb, struct sock * sk, struct tcphdr *th)
{
	u32 ack_seq;

	tcp_insert_skb(skb, &sk->receive_queue);

	/*
	 * Did we get anything new to ack?
	 */
	ack_seq = sk->acked_seq;


	if (!after(skb->seq, ack_seq)) {
		if (after(skb->end_seq, ack_seq)) {
			/* the packet straddles our window end */
			struct sk_buff_head * list = &sk->receive_queue;
			struct sk_buff * next;
			ack_seq = tcp_queue_ack(skb, sk);

			/*
			 * Do we have any old packets to ack that the above
			 * made visible? (Go forward from skb)
			 */
			next = skb->next;
			while (next != (struct sk_buff *) list) {
				if (after(next->seq, ack_seq))
					break;
				if (after(next->end_seq, ack_seq))
					ack_seq = tcp_queue_ack(next, sk);
				next = next->next;
			}

			/*
			 * Ok, we found new data, update acked_seq as
			 * necessary (and possibly send the actual
			 * ACK packet).
			 */
			sk->acked_seq = ack_seq;

		} else {
			if (sk->debug)
				printk("Ack duplicate packet.\n");
			tcp_send_ack(sk);
			return;
		}


		/*
		 * Delay the ack if possible.  Send ack's to
		 * fin frames immediately as there shouldn't be
		 * anything more to come.
		 */
		if (!sk->delay_acks || th->fin) {
			tcp_send_ack(sk);
		} else {
			/*
			 * If psh is set we assume it's an
			 * interactive session that wants quick
			 * acks to avoid nagling too much. 
			 */
			int delay = HZ/2;
			if (th->psh)
				delay = HZ/50;
			tcp_send_delayed_ack(sk, delay, sk->ato);
		}

		/*
		 *	Tell the user we have some more data.
		 */

		if (!sk->dead)
			sk->data_ready(sk,0);

	}
	else
	{
	    /*
	     *	If we've missed a packet, send an ack.
	     *	Also start a timer to send another.
	     *
	     *	4.3reno machines look for these kind of acks so
	     *	they can do fast recovery. Three identical 'old'
	     *	acks lets it know that one frame has been lost
	     *      and should be resent. Because this is before the
	     *	whole window of data has timed out it can take
	     *	one lost frame per window without stalling.
	     *	[See Jacobson RFC1323, Stevens TCP/IP illus vol2]
	     *
	     *	We also should be spotting triple bad sequences.
	     *	[We now do this.]
	     *
	     */
	     
	    if (!skb->acked) 
	    {
		    if(sk->debug)
			    printk("Ack past end of seq packet.\n");
		    tcp_send_ack(sk);
		    /*
		     * We need to be very careful here. We must
		     * not violate Jacobsons packet conservation condition.
		     * This means we should only send an ACK when a packet
		     * leaves the network. We can say a packet left the
		     * network when we see a packet leave the network, or
		     * when an rto measure expires.
		     */
		    tcp_send_delayed_ack(sk,sk->rto,sk->rto);
	    }
	}
}


/*
 *	This routine handles the data.  If there is room in the buffer,
 *	it will be have already been moved into it.  If there is no
 *	room, then we will just have to discard the packet.
 */

static int tcp_data(struct sk_buff *skb, struct sock *sk, 
	 unsigned long saddr, unsigned int len)
{
	struct tcphdr *th;
	u32 new_seq, shut_seq;

	th = skb->h.th;
	skb_pull(skb,th->doff*4);
	skb_trim(skb,len-(th->doff*4));

	/*
	 *	The bytes in the receive read/assembly queue has increased. Needed for the
	 *	low memory discard algorithm 
	 */
	   
	sk->bytes_rcv += skb->len;
	
	if (skb->len == 0 && !th->fin) 
	{
		/* 
		 *	Don't want to keep passing ack's back and forth. 
		 *	(someone sent us dataless, boring frame)
		 */
		if (!th->ack)
			tcp_send_ack(sk);
		kfree_skb(skb, FREE_READ);
		return(0);
	}


	/*
	 *	We no longer have anyone receiving data on this connection.
	 */

#ifndef TCP_DONT_RST_SHUTDOWN		 

	if(sk->shutdown & RCV_SHUTDOWN)
	{
		/*
		 *	FIXME: BSD has some magic to avoid sending resets to
		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
		 *	BSD stacks still have broken keepalives so we want to
		 *	cope with it.
		 */

		if(skb->len)	/* We don't care if it's just an ack or
				   a keepalive/window probe */
		{
			new_seq = skb->seq + skb->len + th->syn;	/* Right edge of _data_ part of frame */
			
			/* Do this the way 4.4BSD treats it. Not what I'd
			   regard as the meaning of the spec but it's what BSD
			   does and clearly they know everything 8) */

			/*
			 *	This is valid because of two things
			 *
			 *	a) The way tcp_data behaves at the bottom.
			 *	b) A fin takes effect when read not when received.
			 */
			 
			shut_seq = sk->acked_seq+1;	/* Last byte */
			
			if(after(new_seq,shut_seq))
			{
				if(sk->debug)
					printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
						sk, new_seq, shut_seq, sk->blog);
				if(sk->dead)
				{
					sk->acked_seq = new_seq + th->fin;
					tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
						sk->prot, NULL, skb->dev, 0, 255);
					tcp_statistics.TcpEstabResets++;
					sk->err = EPIPE;
					sk->error_report(sk);
					sk->shutdown = SHUTDOWN_MASK;
					tcp_set_state(sk,TCP_CLOSE);
					kfree_skb(skb, FREE_READ);
					return 0;
				}
			}
		}
	}

#endif

	/*
  	 * We should only call this if there is data in the frame.
 	 */
	tcp_delack_estimator(sk);

	tcp_queue(skb, sk, th);

	return(0);
}


/*
 *	This routine is only called when we have urgent data
 *	signalled. Its the 'slow' part of tcp_urg. It could be
 *	moved inline now as tcp_urg is only called from one
 *	place. We handle URGent data wrong. We have to - as
 *	BSD still doesn't use the correction from RFC961.
 *
 *	For 1003.1g we should support a new option TCP_STDURG to permit
 *	either form.
 */
 
static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
{
	u32 ptr = ntohs(th->urg_ptr);

	if (ptr)
		ptr--;
	ptr += ntohl(th->seq);

	/* ignore urgent data that we've already seen and read */
	if (after(sk->copied_seq, ptr))
		return;

	/* do we already have a newer (or duplicate) urgent pointer? */
	if (sk->urg_data && !after(ptr, sk->urg_seq))
		return;

	/* tell the world about our new urgent pointer */
	if (sk->proc != 0) {
		if (sk->proc > 0) {
			kill_proc(sk->proc, SIGURG, 1);
		} else {
			kill_pg(-sk->proc, SIGURG, 1);
		}
	}
	/*
	 *	We may be adding urgent data when the last byte read was
	 *	urgent. To do this requires some care. We cannot just ignore
	 *	sk->copied_seq since we would read the last urgent byte again
	 *	as data, nor can we alter copied_seq until this data arrives
	 *	or we break the sematics of SIOCATMARK (and thus sockatmark())
	 */
	if (sk->urg_seq == sk->copied_seq)
		sk->copied_seq++;	/* Move the copied sequence on correctly */
	sk->urg_data = URG_NOTYET;
	sk->urg_seq = ptr;
}

/*
 *	This is the 'fast' part of urgent handling.
 */
 
static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
{
	/*
	 *	Check if we get a new urgent pointer - normally not 
	 */
	 
	if (th->urg)
		tcp_check_urg(sk,th);

	/*
	 *	Do we wait for any urgent data? - normally not
	 */
	 
	if (sk->urg_data == URG_NOTYET) {
		u32 ptr;

		/*
		 *	Is the urgent pointer pointing into this packet? 
		 */	 
		ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4;
		if (ptr < len) {
			sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
			if (!sk->dead)
				sk->data_ready(sk,0);
		}
	}
}

/*
 * This should be a bit smarter and remove partially
 * overlapping stuff too, but this should be good
 * enough for any even remotely normal case (and the
 * worst that can happen is that we have a few
 * unnecessary packets in the receive queue).
 *
 * This function is never called with an empty list..
 */
static inline void tcp_remove_dups(struct sk_buff_head * list)
{
	struct sk_buff * next = list->next;

	for (;;) {
		struct sk_buff * skb = next;
		next = next->next;
		if (next == (struct sk_buff *) list)
			break;
		if (before(next->end_seq, skb->end_seq)) {
			__skb_unlink(next, list);
			kfree_skb(next, FREE_READ);
			next = skb;
			continue;
		}
		if (next->seq != skb->seq)
			continue;
		__skb_unlink(skb, list);
		kfree_skb(skb, FREE_READ);
	}
}

/*
 * Throw out all unnecessary packets: we've gone over the
 * receive queue limit. This shouldn't happen in a normal
 * TCP connection, but we might have gotten duplicates etc.
 */
static void prune_queue(struct sk_buff_head * list)
{
	for (;;) {
		struct sk_buff * skb = list->prev;

		/* gone through it all? */
		if (skb == (struct sk_buff *) list)
			break;
		if (!skb->acked) {
			__skb_unlink(skb, list);
			kfree_skb(skb, FREE_READ);
			continue;
		}
		tcp_remove_dups(list);
		break;
	}
}

#ifdef CONFIG_IP_TRANSPARENT_PROXY
/*
 *	Check whether a received TCP packet might be for one of our
 *	connections.
 */

int tcp_chkaddr(struct sk_buff *skb)
{
	struct iphdr *iph = skb->h.iph;
	struct tcphdr *th = (struct tcphdr *)(skb->h.raw + iph->ihl*4);
	struct sock *sk;

	sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, th->dest,
			   skb->dev);
	if (!sk)
		return 0;
	/* 0 means accept all LOCAL addresses here, not all the world... */
	if (sk->rcv_saddr == 0)
		return 0;
	return 1;
}
#endif

/*
 *	A TCP packet has arrived.
 *		skb->h.raw is the TCP header.
 */
 
int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	__u32 daddr, unsigned short len,
	__u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	__u32 seq;
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	int r;
#endif

	/*
	 * "redo" is 1 if we have already seen this skb but couldn't
	 * use it at that time (the socket was locked).  In that case
	 * we have already done a lot of the work (looked up the socket
	 * etc).
	 */
	th = skb->h.th;
	sk = skb->sk;
#ifdef CONFIG_RST_COOKIES
	if (th->rst && secure_tcp_probe_number(saddr,daddr,ntohs(th->source),ntohs(th->dest),ntohl(th->seq),1)) {
		add_clearance(saddr);
	}
#endif
	if (!redo) {
		tcp_statistics.TcpInSegs++;
		if (skb->pkt_type!=PACKET_HOST)
			goto discard_it;

		/*
		 *	Pull up the IP header.
		 */
	
		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 *	Try to use the device checksum if provided.
		 */
		switch (skb->ip_summed) 
		{
			case CHECKSUM_NONE:
				skb->csum = csum_partial((char *)th, len, 0);
			case CHECKSUM_HW:
				if (tcp_check(th, len, saddr, daddr, skb->csum))
					goto discard_it;
			default:
				/* CHECKSUM_UNNECESSARY */
		}
#ifdef CONFIG_SYN_COOKIES
retry_search:
#endif
#ifdef CONFIG_IP_TRANSPARENT_PROXY
		if (skb->redirport)
			sk = tcp_v4_proxy_lookup(saddr, th->source, daddr, th->dest, dev->pa_addr, skb->redirport, dev);
		else
#endif
		sk = __tcp_v4_lookup(th, saddr, th->source, daddr, th->dest, dev);
		if (!sk)
			goto no_tcp_socket;
		skb->sk = sk;
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		skb->acked = 0;
		skb->used = 0;
		skb->free = 1;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 * We may need to add it to the backlog here. 
		 */
		if (sk->users) 
		{
			__skb_queue_tail(&sk->back_log, skb);
			return(0);
		}
	}

	/*
	 *	If this socket has got a reset it's to all intents and purposes 
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk->zapped || sk->state==TCP_CLOSE) {
		goto no_tcp_socket;
	}

	if (!sk->prot) 
	{
		printk(KERN_CRIT "IMPOSSIBLE 3\n");
		return(0);
	}


	/*
	 *	Charge the memory to the socket. 
	 */
	 
	skb->sk=sk;
	atomic_add(skb->truesize, &sk->rmem_alloc);

	/*
	 * Mark the time of the last received packet.
	 */
	sk->idletime = jiffies;
	
	/*
	 *	We should now do header prediction.
	 */
	 
	/*
	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{
	
		/*
		 *	Now deal with unusual cases.
		 */
	 
		if(sk->state==TCP_LISTEN)
		{
			/* Don't start connections with illegal address
			   ranges. Trying to talk TCP to a broken dhcp host
			   isnt good on a lan with broken SunOS 4.x boxes 
			   who think its a broadcast */
			   
			if ((saddr | daddr) == 0)
				goto discard_it;
				
			if (th->ack) {	/* These use the socket TOS.. might want to be the received TOS */
#ifdef CONFIG_SYN_COOKIES
				if (!th->syn && !th->rst) {
					__u32 acked_seq = ntohl(th->ack_seq)-1;
					int mtu_index = (acked_seq&0x7); /* extract MTU */
					__u32 count = jiffies/(60*HZ);

					acked_seq = acked_seq&0xfffffff8;

					/* Any time in the last 2 minutes is OK */
					if (acked_seq == secure_tcp_syn_cookie(daddr,
					    saddr,ntohs(th->source),ntohs(th->dest),
					    ntohl(th->seq)-1,count)
					|| acked_seq == secure_tcp_syn_cookie(daddr,
					    saddr,ntohs(th->source),ntohs(th->dest),
					    ntohl(th->seq)-1,count-1)
					|| acked_seq == secure_tcp_syn_cookie(daddr,
					    saddr,ntohs(th->source),ntohs(th->dest),
					    ntohl(th->seq)-1,count-2)) {
						/* If this passes, we need to fake up the
						* new socket in TCP_SYN_SENT state and
						* call ourselves recursively to handle
						* the move to ESTABLISHED using the
						* current packet. Nasty, but a cleaner
						* solution would require major rewrites.
						*/
						if (tcp_conn_request_fake(sk, skb, daddr, saddr, opt,
						     			  dev, (acked_seq | mtu_index), cookie_mtu[mtu_index])) {

							goto retry_search;
						}
					}
				}
#endif
				tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,0, 255);
			}

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Sun's have
			 *	this problem so I'm ignoring it 
			 */
			   
#ifdef CONFIG_IP_TRANSPARENT_PROXY
			/*
			 * We may get non-local addresses and still want to
			 * handle them locally, due to transparent proxying.
			 * Thus, narrow down the test to what is really meant.
			 */
			if(th->rst || !th->syn || th->ack || (r = ip_chk_addr(daddr)) == IS_BROADCAST || r == IS_MULTICAST)
#else
			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
#endif
			{
				kfree_skb(skb, FREE_READ);
				return 0;
			}
		
			/*	
			 *	Guess we need to make a new socket up
			 */
			seq = secure_tcp_sequence_number(saddr, daddr,
							 skb->h.th->dest,
							 skb->h.th->source);
			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, seq);
		
			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 *
			 *	Now TTCP is starting to use we ought to queue this data.
			 */
			 
			return 0;
		}
	
		/* 
		 *	Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
		 *	then it's a new connection
		 */
		 
		if (sk->state == TCP_SYN_RECV)
		{
			if(th->syn && skb->seq+1 == sk->acked_seq)
			{
				kfree_skb(skb, FREE_READ);
				return 0;
			}
			goto rfc_step4;
		}
		
		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected. The SYN_SENT case is unusual and should
		 *	not be in line code. [AC]
		 */
	   
		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack.
				 * We used to test this with a call to tcp_ack,
				 * but this loses, because it takes the SYN
				 * packet out of the send queue, even if
				 * the ACK doesn't have the SYN bit sent, and
				 * therefore isn't the one we are waiting for.
				 */
				if (after(skb->ack_seq, sk->sent_seq) || before(skb->ack_seq, sk->rcv_ack_seq))
				{
					/* Reset the ack - it's an ack from a 
					   different connection  [ th->rst is checked in tcp_send_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_send_reset(daddr, saddr, th,
						sk->prot, opt,dev,0,255);
					kfree_skb(skb, FREE_READ);
					return(0);
				}
				if(th->rst)
					return tcp_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
	         			tcp_statistics.TcpAttemptFails++;
	                                tcp_send_reset(daddr, saddr, th,
	                                        sk->prot, opt,dev,0,255);
					kfree_skb(skb, FREE_READ);
					return 0;
				}

				/* process the ACK, get the SYN packet out
				 * of the send queue, do other initial
				 * processing stuff. [We know it's good, and
				 * we know it's the SYN,ACK we want.]
				 */
				tcp_ack(sk,th,skb->ack_seq,len);

				/* We must check here (before tcp_options) whether
				 * peer advertised a zero sized window on us, else
				 * we end up with a zero sk->{mtu,mss} and thus bomb
				 * out in tcp_do_sendmsg. -DaveM
				 */
				if(sk->max_window == 0)
					sk->max_window = 32;

				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				sk->acked_seq = skb->seq+1;
				sk->lastwin_seq = skb->seq+1;
				sk->fin_seq = skb->seq;
				tcp_send_ack(sk);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);

#if 0
				sk->dummy_th.dest=th->source;
				tcp_v4_rehash(sk);
#endif

				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}

				/* Reset the RTT estimator to the initial
				 * state rather than testing to avoid
				 * updating it on the ACK to the SYN packet.
				 */
				sk->rtt = 0;
				sk->rto = TCP_TIMEOUT_INIT;
				sk->mdev = TCP_TIMEOUT_INIT;
				goto rfc_step6;
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);
					
					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}		
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				return 0;
			}

			/*
			 *	Data maybe.. drop through
			 */
			 
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
	 *	a more complex suggestion for fixing these reuse issues in RFC1644
	 *	but not yet ready for general use. Also see RFC1379.
	 *
	 *	Note the funny way we go back to the top of this function for
	 *	this case ("goto try_next_socket").  That also takes care of
	 *	checking "sk->users" for the new socket as well as doing all
	 *	the normal tests on the packet.
	 */
	
#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
			after(skb->seq, sk->acked_seq) && !th->rst)
		{
			u32 seq = sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;	   
			atomic_sub(skb->truesize, &sk->rmem_alloc);
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
#ifdef CONFIG_IP_TRANSPARENT_PROXY
			/* What to do here?
			 * For the non-proxy case, this code is effectively almost a no-op,
			 * due to the sk = NULL.  Is that intentional?  If so, why shouldn't we
			 * do the same for the proxy case and get rid of some useless code?
			 */
			if (skb->redirport)
				sk = tcp_v4_proxy_lookup(saddr, th->source, daddr, th->dest,
							 dev->pa_addr, skb->redirport, dev);
			else
#endif
			sk = __tcp_v4_lookup(th, saddr, th->source, daddr, th->dest, dev);
			/* this is not really correct: we should check sk->users */
			if (sk && sk->state==TCP_LISTEN)
			{
				skb->sk = sk;
				atomic_add(skb->truesize, &sk->rmem_alloc);
				/* FIXME: Is the sequence number addition
				 * of 128000 here enough for fast networks?
				 * Also, does this reduce the security of
				 * our tcp sequence numbers?
			 	 */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif	
	}

rfc_step4:		/* I'll clean this up later */

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs 
	 */

	if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
	{
		bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev);
		kfree_skb(skb, FREE_READ);
		return 0;
	}

	if(th->rst)
		return tcp_reset(sk,skb);
	
	/*
	 *	Check for a SYN, and ensure it matches the SYN we were
	 *	first sent. We have to handle the rather unusual (but valid)
	 *	sequence that KA9Q derived products may generate of
	 *
	 *	SYN
	 *				SYN|ACK Data
	 *	ACK	(lost)
	 *				SYN|ACK Data + More Data
	 *	.. we must ACK not RST...
	 *
	 *	We keep syn_seq as the sequence space occupied by the 
	 *	original syn. 
	 */
	 
	if(th->syn && skb->seq!=sk->syn_seq)
	{
		tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev,0, 255);
		return tcp_reset(sk,skb);	
	}

	/*
	 *	Process the ACK
	 */
	 
	if(!th->ack)
	{
		kfree_skb(skb, FREE_WRITE);
		return 0;
	}

	if(!tcp_ack(sk,th,skb->ack_seq,len))
	{
		/*
		 *	Our three way handshake failed.
		 */
		 
		if(sk->state==TCP_SYN_RECV)
		{
			tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,0,255);
		}
		kfree_skb(skb, FREE_READ);
		return 0;
	}
	
rfc_step6:
	/*
	 *	If the accepted buffer put us over our queue size we
	 *	now drop it (we must process the ack first to avoid
	 *	deadlock cases).
	 */

	/*
	 *	Process urgent data
	 */
	 	
	tcp_urg(sk, th, len);
	
	/*
	 *	Process the encapsulated data
	 */
	
	if(tcp_data(skb,sk, saddr, len))
		kfree_skb(skb, FREE_READ);

	/*
	 *	If we had a partial packet being help up due to
	 *	application of Nagle's rule we are now free to send it.
	 */
	if (th->ack
	    && sk->packets_out == 0
	    && sk->partial != NULL
	    && skb_queue_empty(&sk->write_queue)
	    && sk->send_head == NULL) 
	{
		tcp_send_partial(sk);
	}

	/*
	 *	If our receive queue has grown past its limits,
	 *	try to prune away duplicates etc..
	 */
	if (sk->rmem_alloc > sk->rcvbuf)
		prune_queue(&sk->receive_queue);

	/*
	 *	And done
	 */	
	
	return 0;

no_tcp_socket:
	/*
	 * No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
	 */
	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,0,255);

discard_it:
	/*
	 *	Discard frame
	 */
	skb->sk = NULL;
	kfree_skb(skb, FREE_READ);
	return 0;
}
Elixir Cross Referencer

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.