drivers/net/tcpip/transport/tcp/tcpcore.c

   1 /*
   2  * COPYRIGHT:   See COPYING in the top level directory
   3  * PROJECT:     ReactOS TCP/IP protocol driver
   4  * FILE:        transport/tcp/tcpcore.c
   5  * PURPOSE:     Transmission Control Protocol
   6  * PROGRAMMERS: Casper S. Hornstrup (chorns@users.sourceforge.net)
   7  * REVISIONS:
   8  *   CSH 15-01-2003 Imported from linux kernel 2.4.20
   9  */
  10
  11 /*
  12  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  13  *              operating system.  INET is implemented using the  BSD Socket
  14  *              interface as the means of communication with the user level.
  15  *
  16  *              Implementation of the Transmission Control Protocol(TCP).
  17  *
  18  * Version:     $Id$
  19  *
  20  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  21  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  22  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  23  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  24  *              Florian La Roche, <flla@stud.uni-sb.de>
  25  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  26  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  27  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  28  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  29  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  30  *              Jorge Cwik, <jorge@laser.satlink.net>
  31  *
  32  * Fixes:
  33  *              Alan Cox        :       Numerous verify_area() calls
  34  *              Alan Cox        :       Set the ACK bit on a reset
  35  *              Alan Cox        :       Stopped it crashing if it closed while
  36  *                                      sk->inuse=1 and was trying to connect
  37  *                                      (tcp_err()).
  38  *              Alan Cox        :       All icmp error handling was broken
  39  *                                      pointers passed where wrong and the
  40  *                                      socket was looked up backwards. Nobody
  41  *                                      tested any icmp error code obviously.
  42  *              Alan Cox        :       tcp_err() now handled properly. It
  43  *                                      wakes people on errors. poll
  44  *                                      behaves and the icmp error race
  45  *                                      has gone by moving it into sock.c
  46  *              Alan Cox        :       tcp_send_reset() fixed to work for
  47  *                                      everything not just packets for
  48  *                                      unknown sockets.
  49  *              Alan Cox        :       tcp option processing.
  50  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  51  *                                      syn rule wrong]
  52  *              Herp Rosmanith  :       More reset fixes
  53  *              Alan Cox        :       No longer acks invalid rst frames.
  54  *                                      Acking any kind of RST is right out.
  55  *              Alan Cox        :       Sets an ignore me flag on an rst
  56  *                                      receive otherwise odd bits of prattle
  57  *                                      escape still
  58  *              Alan Cox        :       Fixed another acking RST frame bug.
  59  *                                      Should stop LAN workplace lockups.
  60  *              Alan Cox        :       Some tidyups using the new skb list
  61  *                                      facilities
  62  *              Alan Cox        :       sk->keepopen now seems to work
  63  *              Alan Cox        :       Pulls options out correctly on accepts
  64  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  65  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  66  *                                      bit to skb ops.
  67  *              Alan Cox        :       Tidied tcp_data to avoid a potential
  68  *                                      nasty.
  69  *              Alan Cox        :       Added some better commenting, as the
  70  *                                      tcp is hard to follow
  71  *              Alan Cox        :       Removed incorrect check for 20 * psh
  72  *      Michael O'Reilly        :       ack < copied bug fix.
  73  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  74  *              Alan Cox        :       FIN with no memory -> CRASH
  75  *              Alan Cox        :       Added socket option proto entries.
  76  *                                      Also added awareness of them to accept.
  77  *              Alan Cox        :       Added TCP options (SOL_TCP)
  78  *              Alan Cox        :       Switched wakeup calls to callbacks,
  79  *                                      so the kernel can layer network
  80  *                                      sockets.
  81  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  82  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  83  *              Alan Cox        :       RST frames sent on unsynchronised
  84  *                                      state ack error.
  85  *              Alan Cox        :       Put in missing check for SYN bit.
  86  *              Alan Cox        :       Added tcp_select_window() aka NET2E
  87  *                                      window non shrink trick.
  88  *              Alan Cox        :       Added a couple of small NET2E timer
  89  *                                      fixes
  90  *              Charles Hedrick :       TCP fixes
  91  *              Toomas Tamm     :       TCP window fixes
  92  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  93  *              Charles Hedrick :       Rewrote most of it to actually work
  94  *              Linus           :       Rewrote tcp_read() and URG handling
  95  *                                      completely
  96  *              Gerhard Koerting:       Fixed some missing timer handling
  97  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  98  *              Gerhard Koerting:       PC/TCP workarounds
  99  *              Adam Caldwell   :       Assorted timer/timing errors
 100  *              Matthew Dillon  :       Fixed another RST bug
 101  *              Alan Cox        :       Move to kernel side addressing changes.
 102  *              Alan Cox        :       Beginning work on TCP fastpathing
 103  *                                      (not yet usable)
 104  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
 105  *              Alan Cox        :       TCP fast path debugging
 106  *              Alan Cox        :       Window clamping
 107  *              Michael Riepe   :       Bug in tcp_check()
 108  *              Matt Dillon     :       More TCP improvements and RST bug fixes
 109  *              Matt Dillon     :       Yet more small nasties remove from the
 110  *                                      TCP code (Be very nice to this man if
 111  *                                      tcp finally works 100%) 8)
 112  *              Alan Cox        :       BSD accept semantics.
 113  *              Alan Cox        :       Reset on closedown bug.
 114  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 115  *              Michael Pall    :       Handle poll() after URG properly in
 116  *                                      all cases.
 117  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 118  *                                      (multi URG PUSH broke rlogin).
 119  *              Michael Pall    :       Fix the multi URG PUSH problem in
 120  *                                      tcp_readable(), poll() after URG
 121  *                                      works now.
 122  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 123  *                                      BSD api.
 124  *              Alan Cox        :       Changed the semantics of sk->socket to
 125  *                                      fix a race and a signal problem with
 126  *                                      accept() and async I/O.
 127  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 128  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 129  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 130  *                                      clients/servers which listen in on
 131  *                                      fixed ports.
 132  *              Alan Cox        :       Cleaned the above up and shrank it to
 133  *                                      a sensible code size.
 134  *              Alan Cox        :       Self connect lockup fix.
 135  *              Alan Cox        :       No connect to multicast.
 136  *              Ross Biro       :       Close unaccepted children on master
 137  *                                      socket close.
 138  *              Alan Cox        :       Reset tracing code.
 139  *              Alan Cox        :       Spurious resets on shutdown.
 140  *              Alan Cox        :       Giant 15 minute/60 second timer error
 141  *              Alan Cox        :       Small whoops in polling before an
 142  *                                      accept.
 143  *              Alan Cox        :       Kept the state trace facility since
 144  *                                      it's handy for debugging.
 145  *              Alan Cox        :       More reset handler fixes.
 146  *              Alan Cox        :       Started rewriting the code based on
 147  *                                      the RFC's for other useful protocol
 148  *                                      references see: Comer, KA9Q NOS, and
 149  *                                      for a reference on the difference
 150  *                                      between specifications and how BSD
 151  *                                      works see the 4.4lite source.
 152  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 153  *                                      close.
 154  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 155  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 156  *              Alan Cox        :       Reimplemented timers as per the RFC
 157  *                                      and using multiple timers for sanity.
 158  *              Alan Cox        :       Small bug fixes, and a lot of new
 159  *                                      comments.
 160  *              Alan Cox        :       Fixed dual reader crash by locking
 161  *                                      the buffers (much like datagram.c)
 162  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 163  *                                      now gets fed up of retrying without
 164  *                                      (even a no space) answer.
 165  *              Alan Cox        :       Extracted closing code better
 166  *              Alan Cox        :       Fixed the closing state machine to
 167  *                                      resemble the RFC.
 168  *              Alan Cox        :       More 'per spec' fixes.
 169  *              Jorge Cwik      :       Even faster checksumming.
 170  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 171  *                                      only frames. At least one pc tcp stack
 172  *                                      generates them.
 173  *              Alan Cox        :       Cache last socket.
 174  *              Alan Cox        :       Per route irtt.
 175  *              Matt Day        :       poll()->select() match BSD precisely on error
 176  *              Alan Cox        :       New buffers
 177  *              Marc Tamsky     :       Various sk->prot->retransmits and
 178  *                                      sk->retransmits misupdating fixed.
 179  *                                      Fixed tcp_write_timeout: stuck close,
 180  *                                      and TCP syn retries gets used now.
 181  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 182  *                                      ack if state is TCP_CLOSED.
 183  *              Alan Cox        :       Look up device on a retransmit - routes may
 184  *                                      change. Doesn't yet cope with MSS shrink right
 185  *                                      but its a start!
 186  *              Marc Tamsky     :       Closing in closing fixes.
 187  *              Mike Shaver     :       RFC1122 verifications.
 188  *              Alan Cox        :       rcv_saddr errors.
 189  *              Alan Cox        :       Block double connect().
 190  *              Alan Cox        :       Small hooks for enSKIP.
 191  *              Alexey Kuznetsov:       Path MTU discovery.
 192  *              Alan Cox        :       Support soft errors.
 193  *              Alan Cox        :       Fix MTU discovery pathological case
 194  *                                      when the remote claims no mtu!
 195  *              Marc Tamsky     :       TCP_CLOSE fix.
 196  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 197  *                                      window but wrong (fixes NT lpd problems)
 198  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 199  *              Joerg Reuter    :       No modification of locked buffers in
 200  *                                      tcp_do_retransmit()
 201  *              Eric Schenk     :       Changed receiver side silly window
 202  *                                      avoidance algorithm to BSD style
 203  *                                      algorithm. This doubles throughput
 204  *                                      against machines running Solaris,
 205  *                                      and seems to result in general
 206  *                                      improvement.
 207  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 208  *      Willy Konynenberg       :       Transparent proxying support.
 209  *      Mike McLagan            :       Routing by source
 210  *              Keith Owens     :       Do proper merging with partial SKB's in
 211  *                                      tcp_do_sendmsg to avoid burstiness.
 212  *              Eric Schenk     :       Fix fast close down bug with
 213  *                                      shutdown() followed by close().
 214  *              Andi Kleen      :       Make poll agree with SIGIO
 215  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
 216  *                                      lingertime == 0 (RFC 793 ABORT Call)
 217  *
 218  *              This program is free software; you can redistribute it and/or
 219  *              modify it under the terms of the GNU General Public License
 220  *              as published by the Free Software Foundation; either version
 221  *              2 of the License, or(at your option) any later version.
 222  *
 223  * Description of States:
 224  *
 225  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 226  *
 227  *      TCP_SYN_RECV            received a connection request, sent ack,
 228  *                              waiting for final ack in three-way handshake.
 229  *
 230  *      TCP_ESTABLISHED         connection established
 231  *
 232  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 233  *                              transmission of remaining buffered data
 234  *
 235  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 236  *                              to shutdown
 237  *
 238  *      TCP_CLOSING             both sides have shutdown but we still have
 239  *                              data we have to finish sending
 240  *
 241  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 242  *                              closed, can only be entered from FIN_WAIT2
 243  *                              or CLOSING.  Required because the other end
 244  *                              may not have gotten our last ACK causing it
 245  *                              to retransmit the data packet (which we ignore)
 246  *
 247  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 248  *                              us to finish writing our data and to shutdown
 249  *                              (we have to close() to move on to LAST_ACK)
 250  *
 251  *      TCP_LAST_ACK            out side has shutdown after remote has
 252  *                              shutdown.  There may still be data in our
 253  *                              buffer that we have to finish sending
 254  *
 255  *      TCP_CLOSE               socket is finished
 256  */
 257
 258 #if 0
 259 #include <linux/config.h>
 260 #include <linux/types.h>
 261 #include <linux/fcntl.h>
 262 #include <linux/poll.h>
 263 #include <linux/init.h>
 264 #include <linux/smp_lock.h>
 265 #include <linux/fs.h>
 266
 267 #include <net/icmp.h>
 268 #include <net/tcp.h>
 269
 270 #include <asm/uaccess.h>
 271 #include <asm/ioctls.h>
 272 #else
 273 #include "linux.h"
 274 #include "tcpcore.h"
 275 #endif
 276
 277 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 278
 279 #ifdef ROS_STATISTICS
 280 struct tcp_mib  tcp_statistics[NR_CPUS*2];
 281 #endif
 282
 283 kmem_cache_t *tcp_openreq_cachep;
 284 kmem_cache_t *tcp_bucket_cachep;
 285 kmem_cache_t *tcp_timewait_cachep;
 286
 287 #if 0
 288 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 289 #endif
 290
 291 int sysctl_tcp_mem[3];
 292 int sysctl_tcp_wmem[3] = { 4*1024, 16*1024, 128*1024 };
 293 int sysctl_tcp_rmem[3] = { 4*1024, 87380, 87380*2 };
 294
 295 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
 296 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
 297
 298 /* Pressure flag: try to collapse.
 299  * Technical note: it is used by multiple contexts non atomically.
 300  * All the tcp_mem_schedule() is of this nature: accounting
 301  * is strict, actions are advisory and have some latency. */
 302 int tcp_memory_pressure;
 303
 304 #define TCP_PAGES(amt) (((amt)+TCP_MEM_QUANTUM-1)/TCP_MEM_QUANTUM)
 305
 306 int tcp_mem_schedule(struct sock *sk, int size, int kind)
 307 {
 308         int amt = TCP_PAGES(size);
 309
 310         sk->forward_alloc += amt*TCP_MEM_QUANTUM;
 311         atomic_add(amt, &tcp_memory_allocated);
 312
 313         /* Under limit. */
 314         if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
 315                 if (tcp_memory_pressure)
 316                         tcp_memory_pressure = 0;
 317                 return 1;
 318         }
 319
 320         /* Over hard limit. */
 321         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
 322                 tcp_enter_memory_pressure();
 323                 goto suppress_allocation;
 324         }
 325
 326         /* Under pressure. */
 327         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
 328                 tcp_enter_memory_pressure();
 329
 330         if (kind) {
 331                 if (atomic_read(&sk->rmem_alloc) < sysctl_tcp_rmem[0])
 332                         return 1;
 333         } else {
 334                 if (sk->wmem_queued < sysctl_tcp_wmem[0])
 335                         return 1;
 336         }
 337
 338         if (!tcp_memory_pressure ||
 339             sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated)
 340             * TCP_PAGES(sk->wmem_queued+atomic_read(&sk->rmem_alloc)+
 341                         sk->forward_alloc))
 342                 return 1;
 343
 344 suppress_allocation:
 345
 346         if (kind == 0) {
 347                 tcp_moderate_sndbuf(sk);
 348
 349                 /* Fail only if socket is _under_ its sndbuf.
 350                  * In this case we cannot block, so that we have to fail.
 351                  */
 352                 if (sk->wmem_queued+size >= sk->sndbuf)
 353                         return 1;
 354         }
 355
 356         /* Alas. Undo changes. */
 357         sk->forward_alloc -= amt*TCP_MEM_QUANTUM;
 358         atomic_sub(amt, &tcp_memory_allocated);
 359         return 0;
 360 }
 361
 362 void __tcp_mem_reclaim(struct sock *sk)
 363 {
 364         if (sk->forward_alloc >= TCP_MEM_QUANTUM) {
 365                 atomic_sub(sk->forward_alloc/TCP_MEM_QUANTUM, &tcp_memory_allocated);
 366                 sk->forward_alloc &= (TCP_MEM_QUANTUM-1);
 367                 if (tcp_memory_pressure &&
 368                     atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
 369                         tcp_memory_pressure = 0;
 370         }
 371 }
 372
 373 void tcp_rfree(struct sk_buff *skb)
 374 {
 375         struct sock *sk = skb->sk;
 376
 377         atomic_sub(skb->truesize, &sk->rmem_alloc);
 378         sk->forward_alloc += skb->truesize;
 379 }
 380
 381 /*
 382  * LISTEN is a special case for poll..
 383  */
 384 static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
 385 {
 386         return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
 387 }
 388
 389 /*
 390  *      Wait for a TCP event.
 391  *
 392  *      Note that we don't need to lock the socket, as the upper poll layers
 393  *      take care of normal races (between the test and the event) and we don't
 394  *      go look at any of the socket buffers directly.
 395  */
 396 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
 397 {
 398 #if 0
 399         unsigned int mask;
 400         struct sock *sk = sock->sk;
 401         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 402
 403         poll_wait(file, sk->sleep, wait);
 404         if (sk->state == TCP_LISTEN)
 405                 return tcp_listen_poll(sk, wait);
 406
 407         /* Socket is not locked. We are protected from async events
 408            by poll logic and correct handling of state changes
 409            made by another threads is impossible in any case.
 410          */
 411
 412         mask = 0;
 413         if (sk->err)
 414                 mask = POLLERR;
 415
 416         /*
 417          * POLLHUP is certainly not done right. But poll() doesn't
 418          * have a notion of HUP in just one direction, and for a
 419          * socket the read side is more interesting.
 420          *
 421          * Some poll() documentation says that POLLHUP is incompatible
 422          * with the POLLOUT/POLLWR flags, so somebody should check this
 423          * all. But careful, it tends to be safer to return too many
 424          * bits than too few, and you can easily break real applications
 425          * if you don't tell them that something has hung up!
 426          *
 427          * Check-me.
 428          *
 429          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
 430          * our fs/select.c). It means that after we received EOF,
 431          * poll always returns immediately, making impossible poll() on write()
 432          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
 433          * if and only if shutdown has been made in both directions.
 434          * Actually, it is interesting to look how Solaris and DUX
 435          * solve this dilemma. I would prefer, if PULLHUP were maskable,
 436          * then we could set it on SND_SHUTDOWN. BTW examples given
 437          * in Stevens' books assume exactly this behaviour, it explains
 438          * why PULLHUP is incompatible with POLLOUT.    --ANK
 439          *
 440          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
 441          * blocking on fresh not-connected or disconnected socket. --ANK
 442          */
 443         if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
 444                 mask |= POLLHUP;
 445         if (sk->shutdown & RCV_SHUTDOWN)
 446                 mask |= POLLIN | POLLRDNORM;
 447
 448         /* Connected? */
 449         if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
 450                 /* Potential race condition. If read of tp below will
 451                  * escape above sk->state, we can be illegally awaken
 452                  * in SYN_* states. */
 453                 if ((tp->rcv_nxt != tp->copied_seq) &&
 454                     (tp->urg_seq != tp->copied_seq ||
 455                      tp->rcv_nxt != tp->copied_seq+1 ||
 456                      sk->urginline || !tp->urg_data))
 457                         mask |= POLLIN | POLLRDNORM;
 458
 459                 if (!(sk->shutdown & SEND_SHUTDOWN)) {
 460                         if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
 461                                 mask |= POLLOUT | POLLWRNORM;
 462                         } else {  /* send SIGIO later */
 463                                 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
 464                                 set_bit(SOCK_NOSPACE, &sk->socket->flags);
 465
 466                                 /* Race breaker. If space is freed after
 467                                  * wspace test but before the flags are set,
 468                                  * IO signal will be lost.
 469                                  */
 470                                 if (tcp_wspace(sk) >= tcp_min_write_space(sk))
 471                                         mask |= POLLOUT | POLLWRNORM;
 472                         }
 473                 }
 474
 475                 if (tp->urg_data & TCP_URG_VALID)
 476                         mask |= POLLPRI;
 477         }
 478         return mask;
 479 #else
 480   return 0;
 481 #endif
 482 }
 483
 484 /*
 485  *      TCP socket write_space callback.
 486  */
 487 void tcp_write_space(struct sock *sk)
 488 {
 489 #if 0
 490         struct socket *sock = sk->socket;
 491
 492         if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
 493                 clear_bit(SOCK_NOSPACE, &sock->flags);
 494
 495                 if (sk->sleep && waitqueue_active(sk->sleep))
 496                         wake_up_interruptible(sk->sleep);
 497
 498                 if (sock->fasync_list && !(sk->shutdown&SEND_SHUTDOWN))
 499                         sock_wake_async(sock, 2, POLL_OUT);
 500         }
 501 #endif
 502 }
 503
 504 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 505 {
 506 #if 0
 507         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 508         int answ;
 509
 510         switch(cmd) {
 511         case SIOCINQ:
 512                 if (sk->state == TCP_LISTEN)
 513                         return(-EINVAL);
 514
 515                 lock_sock(sk);
 516                 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
 517                         answ = 0;
 518                 else if (sk->urginline || !tp->urg_data ||
 519                          before(tp->urg_seq,tp->copied_seq) ||
 520                          !before(tp->urg_seq,tp->rcv_nxt)) {
 521                         answ = tp->rcv_nxt - tp->copied_seq;
 522
 523                         /* Subtract 1, if FIN is in queue. */
 524                         if (answ && !skb_queue_empty(&sk->receive_queue))
 525                                 answ -= ((struct sk_buff*)sk->receive_queue.prev)->h.th->fin;
 526                 } else
 527                         answ = tp->urg_seq - tp->copied_seq;
 528                 release_sock(sk);
 529                 break;
 530         case SIOCATMARK:
 531                 {
 532                         answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 533                         break;
 534                 }
 535         case SIOCOUTQ:
 536                 if (sk->state == TCP_LISTEN)
 537                         return(-EINVAL);
 538
 539                 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
 540                         answ = 0;
 541                 else
 542                         answ = tp->write_seq - tp->snd_una;
 543                 break;
 544         default:
 545                 return(-ENOIOCTLCMD);
 546         };
 547
 548         return put_user(answ, (int *)arg);
 549 #else
 550 return 0;
 551 #endif
 552 }
 553
 554
 555 int tcp_listen_start(struct sock *sk)
 556 {
 557 #if 0
 558         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 559         struct tcp_listen_opt *lopt;
 560
 561         sk->max_ack_backlog = 0;
 562         sk->ack_backlog = 0;
 563         tp->accept_queue = tp->accept_queue_tail = NULL;
 564         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
 565         tcp_delack_init(tp);
 566
 567         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
 568         if (!lopt)
 569                 return -ENOMEM;
 570
 571         memset(lopt, 0, sizeof(struct tcp_listen_opt));
 572         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
 573                 if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
 574                         break;
 575
 576         write_lock_bh(&tp->syn_wait_lock);
 577         tp->listen_opt = lopt;
 578         write_unlock_bh(&tp->syn_wait_lock);
 579
 580         /* There is race window here: we announce ourselves listening,
 581          * but this transition is still not validated by get_port().
 582          * It is OK, because this socket enters to hash table only
 583          * after validation is complete.
 584          */
 585         sk->state = TCP_LISTEN;
 586         if (sk->prot->get_port(sk, sk->num) == 0) {
 587                 sk->sport = htons(sk->num);
 588
 589                 sk_dst_reset(sk);
 590                 sk->prot->hash(sk);
 591
 592                 return 0;
 593         }
 594
 595         sk->state = TCP_CLOSE;
 596         write_lock_bh(&tp->syn_wait_lock);
 597         tp->listen_opt = NULL;
 598         write_unlock_bh(&tp->syn_wait_lock);
 599         kfree(lopt);
 600         return -EADDRINUSE;
 601 #endif
 602 }
 603
 604 /*
 605  *      This routine closes sockets which have been at least partially
 606  *      opened, but not yet accepted.
 607  */
 608
 609 static void tcp_listen_stop (struct sock *sk)
 610 {
 611 #if 0
 612         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 613         struct tcp_listen_opt *lopt = tp->listen_opt;
 614         struct open_request *acc_req = tp->accept_queue;
 615         struct open_request *req;
 616         int i;
 617
 618         tcp_delete_keepalive_timer(sk);
 619
 620         /* make all the listen_opt local to us */
 621         write_lock_bh(&tp->syn_wait_lock);
 622         tp->listen_opt =NULL;
 623         write_unlock_bh(&tp->syn_wait_lock);
 624         tp->accept_queue = tp->accept_queue_tail = NULL;
 625
 626         if (lopt->qlen) {
 627                 for (i=0; i<TCP_SYNQ_HSIZE; i++) {
 628                         while ((req = lopt->syn_table[i]) != NULL) {
 629                                 lopt->syn_table[i] = req->dl_next;
 630                                 lopt->qlen--;
 631                                 tcp_openreq_free(req);
 632
 633                 /* Following specs, it would be better either to send FIN
 634                  * (and enter FIN-WAIT-1, it is normal close)
 635                  * or to send active reset (abort).
 636                  * Certainly, it is pretty dangerous while synflood, but it is
 637                  * bad justification for our negligence 8)
 638                  * To be honest, we are not able to make either
 639                  * of the variants now.                 --ANK
 640                  */
 641                         }
 642                 }
 643         }
 644         BUG_TRAP(lopt->qlen == 0);
 645
 646         kfree(lopt);
 647
 648         while ((req=acc_req) != NULL) {
 649                 struct sock *child = req->sk;
 650
 651                 acc_req = req->dl_next;
 652
 653                 local_bh_disable();
 654                 bh_lock_sock(child);
 655                 BUG_TRAP(child->lock.users==0);
 656                 sock_hold(child);
 657
 658                 tcp_disconnect(child, O_NONBLOCK);
 659
 660                 sock_orphan(child);
 661
 662                 atomic_inc(&tcp_orphan_count);
 663
 664                 tcp_destroy_sock(child);
 665
 666                 bh_unlock_sock(child);
 667                 local_bh_enable();
 668                 sock_put(child);
 669
 670                 tcp_acceptq_removed(sk);
 671                 tcp_openreq_fastfree(req);
 672         }
 673         BUG_TRAP(sk->ack_backlog == 0);
 674 #endif
 675 }
 676
 677 /*
 678  *      Wait for a socket to get into the connected state
 679  *
 680  *      Note: Must be called with the socket locked.
 681  */
 682 static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
 683 {
 684 #if 0
 685         struct task_struct *tsk = current;
 686         DECLARE_WAITQUEUE(wait, tsk);
 687
 688         while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
 689                 if(sk->err)
 690                         return sock_error(sk);
 691                 if((1 << sk->state) &
 692                    ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
 693                         return -EPIPE;
 694                 if(!*timeo_p)
 695                         return -EAGAIN;
 696                 if(signal_pending(tsk))
 697                         return sock_intr_errno(*timeo_p);
 698
 699                 __set_task_state(tsk, TASK_INTERRUPTIBLE);
 700                 add_wait_queue(sk->sleep, &wait);
 701                 sk->tp_pinfo.af_tcp.write_pending++;
 702
 703                 release_sock(sk);
 704                 *timeo_p = schedule_timeout(*timeo_p);
 705                 lock_sock(sk);
 706
 707                 __set_task_state(tsk, TASK_RUNNING);
 708                 remove_wait_queue(sk->sleep, &wait);
 709                 sk->tp_pinfo.af_tcp.write_pending--;
 710         }
 711         return 0;
 712 #else
 713   return 0;
 714 #endif
 715 }
 716
 717 static inline int tcp_memory_free(struct sock *sk)
 718 {
 719         return sk->wmem_queued < sk->sndbuf;
 720 }
 721
 722 /*
 723  *      Wait for more memory for a socket
 724  */
 725 static int wait_for_tcp_memory(struct sock * sk, long *timeo)
 726 {
 727 #if 0
 728         int err = 0;
 729         long vm_wait = 0;
 730         long current_timeo = *timeo;
 731         DECLARE_WAITQUEUE(wait, current);
 732
 733         if (tcp_memory_free(sk))
 734                 current_timeo = vm_wait = (net_random()%(HZ/5))+2;
 735
 736         add_wait_queue(sk->sleep, &wait);
 737         for (;;) {
 738                 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
 739
 740                 set_current_state(TASK_INTERRUPTIBLE);
 741
 742                 if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
 743                         goto do_error;
 744                 if (!*timeo)
 745                         goto do_nonblock;
 746                 if (signal_pending(current))
 747                         goto do_interrupted;
 748                 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
 749                 if (tcp_memory_free(sk) && !vm_wait)
 750                         break;
 751
 752                 set_bit(SOCK_NOSPACE, &sk->socket->flags);
 753                 sk->tp_pinfo.af_tcp.write_pending++;
 754                 release_sock(sk);
 755                 if (!tcp_memory_free(sk) || vm_wait)
 756                         current_timeo = schedule_timeout(current_timeo);
 757                 lock_sock(sk);
 758                 sk->tp_pinfo.af_tcp.write_pending--;
 759
 760                 if (vm_wait) {
 761                         vm_wait -= current_timeo;
 762                         current_timeo = *timeo;
 763                         if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
 764                             (current_timeo -= vm_wait) < 0)
 765                                 current_timeo = 0;
 766                         vm_wait = 0;
 767                 }
 768                 *timeo = current_timeo;
 769         }
 770 out:
 771         current->state = TASK_RUNNING;
 772         remove_wait_queue(sk->sleep, &wait);
 773         return err;
 774
 775 do_error:
 776         err = -EPIPE;
 777         goto out;
 778 do_nonblock:
 779         err = -EAGAIN;
 780         goto out;
 781 do_interrupted:
 782         err = sock_intr_errno(*timeo);
 783         goto out;
 784 #endif
 785 }
 786
 787 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
 788
 789 static inline int
 790 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
 791 {
 792 #if 0
 793         if (i) {
 794                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 795                 return page == frag->page &&
 796                         off == frag->page_offset+frag->size;
 797         }
 798         return 0;
 799 #else
 800 return 0;
 801 #endif
 802 }
 803
 804 static inline void
 805 fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size)
 806 {
 807         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 808         frag->page = page;
 809         frag->page_offset = off;
 810         frag->size = size;
 811         skb_shinfo(skb)->nr_frags = i+1;
 812 }
 813
 814 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
 815 {
 816         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 817         tp->pushed_seq = tp->write_seq;
 818 }
 819
 820 static inline int forced_push(struct tcp_opt *tp)
 821 {
 822         return after(tp->write_seq, tp->pushed_seq + (tp->max_window>>1));
 823 }
 824
 825 static inline void
 826 skb_entail(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
 827 {
 828         skb->csum = 0;
 829         TCP_SKB_CB(skb)->seq = tp->write_seq;
 830         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
 831         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
 832         TCP_SKB_CB(skb)->sacked = 0;
 833         __skb_queue_tail(&sk->write_queue, skb);
 834         tcp_charge_skb(sk, skb);
 835         if (tp->send_head == NULL)
 836                 tp->send_head = skb;
 837 }
 838
 839 static inline void
 840 tcp_mark_urg(struct tcp_opt *tp, int flags, struct sk_buff *skb)
 841 {
 842 #if 0
 843         if (flags & MSG_OOB) {
 844                 tp->urg_mode = 1;
 845                 tp->snd_up = tp->write_seq;
 846                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
 847         }
 848 #endif
 849 }
 850
 851 static inline void
 852 tcp_push(struct sock *sk, struct tcp_opt *tp, int flags, int mss_now, int nonagle)
 853 {
 854 #if 0
 855         if (tp->send_head) {
 856                 struct sk_buff *skb = sk->write_queue.prev;
 857                 if (!(flags&MSG_MORE) || forced_push(tp))
 858                         tcp_mark_push(tp, skb);
 859                 tcp_mark_urg(tp, flags, skb);
 860                 __tcp_push_pending_frames(sk, tp, mss_now, (flags&MSG_MORE) ? 2 : nonagle);
 861         }
 862 #endif
 863 }
 864
 865 static int tcp_error(struct sock *sk, int flags, int err)
 866 {
 867 #if 0
 868         if (err == -EPIPE)
 869                 err = sock_error(sk) ? : -EPIPE;
 870         if (err == -EPIPE && !(flags&MSG_NOSIGNAL))
 871                 send_sig(SIGPIPE, current, 0);
 872         return err;
 873 #else
 874   return 0;
 875 #endif
 876 }
 877
 878 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
 879 {
 880 #if 0
 881         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 882         int mss_now;
 883         int err;
 884         ssize_t copied;
 885         long timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
 886
 887         /* Wait for a connection to finish. */
 888         if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 889                 if((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
 890                         goto out_err;
 891
 892         clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
 893
 894         mss_now = tcp_current_mss(sk);
 895         copied = 0;
 896
 897         err = -EPIPE;
 898         if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
 899                 goto do_error;
 900
 901         while (psize > 0) {
 902                 struct sk_buff *skb = sk->write_queue.prev;
 903                 int offset, size, copy, i;
 904                 struct page *page;
 905
 906                 page = pages[poffset/PAGE_SIZE];
 907                 offset = poffset % PAGE_SIZE;
 908                 size = min_t(size_t, psize, PAGE_SIZE-offset);
 909
 910                 if (tp->send_head==NULL || (copy = mss_now - skb->len) <= 0) {
 911 new_segment:
 912                         if (!tcp_memory_free(sk))
 913                                 goto wait_for_sndbuf;
 914
 915                         skb = tcp_alloc_pskb(sk, 0, tp->mss_cache, sk->allocation);
 916                         if (skb == NULL)
 917                                 goto wait_for_memory;
 918
 919                         skb_entail(sk, tp, skb);
 920                         copy = mss_now;
 921                 }
 922
 923                 if (copy > size)
 924                         copy = size;
 925
 926                 i = skb_shinfo(skb)->nr_frags;
 927                 if (can_coalesce(skb, i, page, offset)) {
 928                         skb_shinfo(skb)->frags[i-1].size += copy;
 929                 } else if (i < MAX_SKB_FRAGS) {
 930                         get_page(page);
 931                         fill_page_desc(skb, i, page, offset, copy);
 932                 } else {
 933                         tcp_mark_push(tp, skb);
 934                         goto new_segment;
 935                 }
 936
 937                 skb->len += copy;
 938                 skb->data_len += copy;
 939                 skb->ip_summed = CHECKSUM_HW;
 940                 tp->write_seq += copy;
 941                 TCP_SKB_CB(skb)->end_seq += copy;
 942
 943                 if (!copied)
 944                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
 945
 946                 copied += copy;
 947                 poffset += copy;
 948                 if (!(psize -= copy))
 949                         goto out;
 950
 951                 if (skb->len != mss_now || (flags&MSG_OOB))
 952                         continue;
 953
 954                 if (forced_push(tp)) {
 955                         tcp_mark_push(tp, skb);
 956                         __tcp_push_pending_frames(sk, tp, mss_now, 1);
 957                 } else if (skb == tp->send_head)
 958                         tcp_push_one(sk, mss_now);
 959                 continue;
 960
 961 wait_for_sndbuf:
 962                 set_bit(SOCK_NOSPACE, &sk->socket->flags);
 963 wait_for_memory:
 964                 if (copied)
 965                         tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);
 966
 967                 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
 968                         goto do_error;
 969
 970                 mss_now = tcp_current_mss(sk);
 971         }
 972
 973 out:
 974         if (copied)
 975                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
 976         return copied;
 977
 978 do_error:
 979         if (copied)
 980                 goto out;
 981 out_err:
 982         return tcp_error(sk, flags, err);
 983 #else
 984 return 0;
 985 #endif
 986 }
 987
 988 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
 989 {
 990 #if 0
 991         ssize_t res;
 992         struct sock *sk = sock->sk;
 993
 994 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
 995
 996         if (!(sk->route_caps & NETIF_F_SG) ||
 997             !(sk->route_caps & TCP_ZC_CSUM_FLAGS))
 998                 return sock_no_sendpage(sock, page, offset, size, flags);
 999
1000 #undef TCP_ZC_CSUM_FLAGS
1001
1002         lock_sock(sk);
1003         TCP_CHECK_TIMER(sk);
1004         res = do_tcp_sendpages(sk, &page, offset, size, flags);
1005         TCP_CHECK_TIMER(sk);
1006         release_sock(sk);
1007         return res;
1008 #else
1009   return 0;
1010 #endif
1011 }
1012
1013 #define TCP_PAGE(sk)    (sk->tp_pinfo.af_tcp.sndmsg_page)
1014 #define TCP_OFF(sk)     (sk->tp_pinfo.af_tcp.sndmsg_off)
1015
1016 static inline int
1017 tcp_copy_to_page(struct sock *sk, char *from, struct sk_buff *skb,
1018                  struct page *page, int off, int copy)
1019 {
1020         int err = 0;
1021         unsigned int csum;
1022
1023         csum = csum_and_copy_from_user(from, page_address(page)+off,
1024                                        copy, 0, &err);
1025         if (!err) {
1026                 if (skb->ip_summed == CHECKSUM_NONE)
1027                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1028                 skb->len += copy;
1029                 skb->data_len += copy;
1030                 skb->truesize += copy;
1031                 sk->wmem_queued += copy;
1032                 sk->forward_alloc -= copy;
1033         }
1034         return err;
1035 }
1036
1037 static inline int
1038 skb_add_data(struct sk_buff *skb, char *from, int copy)
1039 {
1040 #if 0
1041         int err = 0;
1042         unsigned int csum;
1043         int off = skb->len;
1044
1045         csum = csum_and_copy_from_user(from, skb_put(skb, copy),
1046                                        copy, 0, &err);
1047         if (!err) {
1048                 skb->csum = csum_block_add(skb->csum, csum, off);
1049                 return 0;
1050         }
1051
1052         __skb_trim(skb, off);
1053         return -EFAULT;
1054 #else
1055 return 0;
1056 #endif
1057 }
1058
1059 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1060 {
1061 #if 0
1062         int tmp = tp->mss_cache;
1063
1064         if (sk->route_caps&NETIF_F_SG) {
1065                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1066
1067                 if (tmp >= pgbreak && tmp <= pgbreak + (MAX_SKB_FRAGS-1)*PAGE_SIZE)
1068                         tmp = pgbreak;
1069         }
1070         return tmp;
1071 #else
1072   return 0;
1073 #endif
1074 }
1075
1076 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
1077 {
1078 #if 0
1079         struct iovec *iov;
1080         struct tcp_opt *tp;
1081         struct sk_buff *skb;
1082         int iovlen, flags;
1083         int mss_now;
1084         int err, copied;
1085         long timeo;
1086
1087         tp = &(sk->tp_pinfo.af_tcp);
1088
1089         lock_sock(sk);
1090         TCP_CHECK_TIMER(sk);
1091
1092         flags = msg->msg_flags;
1093         timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
1094
1095         /* Wait for a connection to finish. */
1096         if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1097                 if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1098                         goto out_err;
1099
1100         /* This should be in poll */
1101         clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
1102
1103         mss_now = tcp_current_mss(sk);
1104
1105         /* Ok commence sending. */
1106         iovlen = msg->msg_iovlen;
1107         iov = msg->msg_iov;
1108         copied = 0;
1109
1110         err = -EPIPE;
1111         if (sk->err || (sk->shutdown&SEND_SHUTDOWN))
1112                 goto do_error;
1113
1114         while (--iovlen >= 0) {
1115                 int seglen=iov->iov_len;
1116                 unsigned char * from=iov->iov_base;
1117
1118                 iov++;
1119
1120                 while (seglen > 0) {
1121                         int copy;
1122
1123                         skb = sk->write_queue.prev;
1124
1125                         if (tp->send_head == NULL ||
1126                             (copy = mss_now - skb->len) <= 0) {
1127
1128 new_segment:
1129                                 /* Allocate new segment. If the interface is SG,
1130                                  * allocate skb fitting to single page.
1131                                  */
1132                                 if (!tcp_memory_free(sk))
1133                                         goto wait_for_sndbuf;
1134
1135                                 skb = tcp_alloc_pskb(sk, select_size(sk, tp), 0, sk->allocation);
1136                                 if (skb == NULL)
1137                                         goto wait_for_memory;
1138
1139                                 skb_entail(sk, tp, skb);
1140                                 copy = mss_now;
1141                         }
1142
1143                         /* Try to append data to the end of skb. */
1144                         if (copy > seglen)
1145                                 copy = seglen;
1146
1147                         /* Where to copy to? */
1148                         if (skb_tailroom(skb) > 0) {
1149                                 /* We have some space in skb head. Superb! */
1150                                 if (copy > skb_tailroom(skb))
1151                                         copy = skb_tailroom(skb);
1152                                 if ((err = skb_add_data(skb, from, copy)) != 0)
1153                                         goto do_fault;
1154                         } else {
1155                                 int merge = 0;
1156                                 int i = skb_shinfo(skb)->nr_frags;
1157                                 struct page *page = TCP_PAGE(sk);
1158                                 int off = TCP_OFF(sk);
1159
1160                                 if (can_coalesce(skb, i, page, off) && off != PAGE_SIZE) {
1161                                         /* We can extend the last page fragment. */
1162                                         merge = 1;
1163                                 } else if (i == MAX_SKB_FRAGS ||
1164                                            (i == 0 && !(sk->route_caps&NETIF_F_SG))) {
1165                                         /* Need to add new fragment and cannot
1166                                          * do this because interface is non-SG,
1167                                          * or because all the page slots are busy.
1168                                          */
1169                                         tcp_mark_push(tp, skb);
1170                                         goto new_segment;
1171                                 } else if (page) {
1172                                         /* If page is cached, align
1173                                          * offset to L1 cache boundary
1174                                          */
1175                                         off = (off+L1_CACHE_BYTES-1)&~(L1_CACHE_BYTES-1);
1176                                         if (off == PAGE_SIZE) {
1177                                                 put_page(page);
1178                                                 TCP_PAGE(sk) = page = NULL;
1179                                         }
1180                                 }
1181
1182                                 if (!page) {
1183                                         /* Allocate new cache page. */
1184                                         if (!(page=tcp_alloc_page(sk)))
1185                                                 goto wait_for_memory;
1186                                         off = 0;
1187                                 }
1188
1189                                 if (copy > PAGE_SIZE-off)
1190                                         copy = PAGE_SIZE-off;
1191
1192                                 /* Time to copy data. We are close to the end! */
1193                                 err = tcp_copy_to_page(sk, from, skb, page, off, copy);
1194                                 if (err) {
1195                                         /* If this page was new, give it to the
1196                                          * socket so it does not get leaked.
1197                                          */
1198                                         if (TCP_PAGE(sk) == NULL) {
1199                                                 TCP_PAGE(sk) = page;
1200                                                 TCP_OFF(sk) = 0;
1201                                         }
1202                                         goto do_error;
1203                                 }
1204
1205                                 /* Update the skb. */
1206                                 if (merge) {
1207                                         skb_shinfo(skb)->frags[i-1].size += copy;
1208                                 } else {
1209                                         fill_page_desc(skb, i, page, off, copy);
1210                                         if (TCP_PAGE(sk)) {
1211                                                 get_page(page);
1212                                         } else if (off + copy < PAGE_SIZE) {
1213                                                 get_page(page);
1214                                                 TCP_PAGE(sk) = page;
1215                                         }
1216                                 }
1217
1218                                 TCP_OFF(sk) = off+copy;
1219                         }
1220
1221                         if (!copied)
1222                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1223
1224                         tp->write_seq += copy;
1225                         TCP_SKB_CB(skb)->end_seq += copy;
1226
1227                         from += copy;
1228                         copied += copy;
1229                         if ((seglen -= copy) == 0 && iovlen == 0)
1230                                 goto out;
1231
1232                         if (skb->len != mss_now || (flags&MSG_OOB))
1233                                 continue;
1234
1235                         if (forced_push(tp)) {
1236                                 tcp_mark_push(tp, skb);
1237                                 __tcp_push_pending_frames(sk, tp, mss_now, 1);
1238                         } else if (skb == tp->send_head)
1239                                 tcp_push_one(sk, mss_now);
1240                         continue;
1241
1242 wait_for_sndbuf:
1243                         set_bit(SOCK_NOSPACE, &sk->socket->flags);
1244 wait_for_memory:
1245                         if (copied)
1246                                 tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);
1247
1248                         if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1249                                 goto do_error;
1250
1251                         mss_now = tcp_current_mss(sk);
1252                 }
1253         }
1254
1255 out:
1256         if (copied)
1257                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1258         TCP_CHECK_TIMER(sk);
1259         release_sock(sk);
1260         return copied;
1261
1262 do_fault:
1263         if (skb->len == 0) {
1264                 if (tp->send_head == skb)
1265                         tp->send_head = NULL;
1266                 __skb_unlink(skb, skb->list);
1267                 tcp_free_skb(sk, skb);
1268         }
1269
1270 do_error:
1271         if (copied)
1272                 goto out;
1273 out_err:
1274         err = tcp_error(sk, flags, err);
1275         TCP_CHECK_TIMER(sk);
1276         release_sock(sk);
1277         return err;
1278 #else
1279   return 0;
1280 #endif
1281 }
1282
1283 /*
1284  *      Handle reading urgent data. BSD has very simple semantics for
1285  *      this, no blocking and very strange errors 8)
1286  */
1287
1288 static int tcp_recv_urg(struct sock * sk, long timeo,
1289                         struct msghdr *msg, int len, int flags,
1290                         int *addr_len)
1291 {
1292 #if 0
1293         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1294
1295         /* No URG data to read. */
1296         if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
1297                 return -EINVAL; /* Yes this is right ! */
1298
1299         if (sk->state==TCP_CLOSE && !sk->done)
1300                 return -ENOTCONN;
1301
1302         if (tp->urg_data & TCP_URG_VALID) {
1303                 int err = 0;
1304                 char c = tp->urg_data;
1305
1306                 if (!(flags & MSG_PEEK))
1307                         tp->urg_data = TCP_URG_READ;
1308
1309                 /* Read urgent data. */
1310                 msg->msg_flags|=MSG_OOB;
1311
1312                 if(len>0) {
1313                         if (!(flags & MSG_TRUNC))
1314                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1315                         len = 1;
1316                 } else
1317                         msg->msg_flags|=MSG_TRUNC;
1318
1319                 return err ? -EFAULT : len;
1320         }
1321
1322         if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
1323                 return 0;
1324
1325         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1326          * the available implementations agree in this case:
1327          * this call should never block, independent of the
1328          * blocking state of the socket.
1329          * Mike <pall@rz.uni-karlsruhe.de>
1330          */
1331         return -EAGAIN;
1332 #else
1333 return 0;
1334 #endif
1335 }
1336
1337 /*
1338  *      Release a skb if it is no longer needed. This routine
1339  *      must be called with interrupts disabled or with the
1340  *      socket locked so that the sk_buff queue operation is ok.
1341  */
1342
1343 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1344 {
1345 #if 0
1346         __skb_unlink(skb, &sk->receive_queue);
1347         __kfree_skb(skb);
1348 #endif
1349 }
1350
1351 /* Clean up the receive buffer for full frames taken by the user,
1352  * then send an ACK if necessary.  COPIED is the number of bytes
1353  * tcp_recvmsg has given to the user so far, it speeds up the
1354  * calculation of whether or not we must ACK for the sake of
1355  * a window update.
1356  */
1357 static void cleanup_rbuf(struct sock *sk, int copied)
1358 {
1359 #if 0
1360         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1361         int time_to_ack = 0;
1362
1363 #if TCP_DEBUG
1364         struct sk_buff *skb = skb_peek(&sk->receive_queue);
1365
1366         BUG_TRAP(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1367 #endif
1368
1369         if (tcp_ack_scheduled(tp)) {
1370                    /* Delayed ACKs frequently hit locked sockets during bulk receive. */
1371                 if (tp->ack.blocked
1372                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1373                     || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss
1374                     /*
1375                      * If this read emptied read buffer, we send ACK, if
1376                      * connection is not bidirectional, user drained
1377                      * receive buffer and there was a small segment
1378                      * in queue.
1379                      */
1380                     || (copied > 0 &&
1381                         (tp->ack.pending&TCP_ACK_PUSHED) &&
1382                         !tp->ack.pingpong &&
1383                         atomic_read(&sk->rmem_alloc) == 0)) {
1384                         time_to_ack = 1;
1385                 }
1386         }
1387
1388         /* We send an ACK if we can now advertise a non-zero window
1389          * which has been raised "significantly".
1390          *
1391          * Even if window raised up to infinity, do not send window open ACK
1392          * in states, where we will not receive more. It is useless.
1393          */
1394         if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
1395                 __u32 rcv_window_now = tcp_receive_window(tp);
1396
1397                 /* Optimize, __tcp_select_window() is not cheap. */
1398                 if (2*rcv_window_now <= tp->window_clamp) {
1399                         __u32 new_window = __tcp_select_window(sk);
1400
1401                         /* Send ACK now, if this read freed lots of space
1402                          * in our buffer. Certainly, new_window is new window.
1403                          * We can advertise it now, if it is not less than current one.
1404                          * "Lots" means "at least twice" here.
1405                          */
1406                         if(new_window && new_window >= 2*rcv_window_now)
1407                                 time_to_ack = 1;
1408                 }
1409         }
1410         if (time_to_ack)
1411                 tcp_send_ack(sk);
1412 #endif
1413 }
1414
1415 /* Now socket state including sk->err is changed only under lock,
1416  * hence we may omit checks after joining wait queue.
1417  * We check receive queue before schedule() only as optimization;
1418  * it is very likely that release_sock() added new data.
1419  */
1420
1421 static long tcp_data_wait(struct sock *sk, long timeo)
1422 {
1423 #if 0
1424         DECLARE_WAITQUEUE(wait, current);
1425
1426         add_wait_queue(sk->sleep, &wait);
1427
1428         __set_current_state(TASK_INTERRUPTIBLE);
1429
1430         set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1431         release_sock(sk);
1432
1433         if (skb_queue_empty(&sk->receive_queue))
1434                 timeo = schedule_timeout(timeo);
1435
1436         lock_sock(sk);
1437         clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1438
1439         remove_wait_queue(sk->sleep, &wait);
1440         __set_current_state(TASK_RUNNING);
1441         return timeo;
1442 #else
1443   return 0;
1444 #endif
1445 }
1446
1447 static void tcp_prequeue_process(struct sock *sk)
1448 {
1449 #if 0
1450         struct sk_buff *skb;
1451         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1452
1453         net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
1454
1455         /* RX process wants to run with disabled BHs, though it is not necessary */
1456         local_bh_disable();
1457         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1458                 sk->backlog_rcv(sk, skb);
1459         local_bh_enable();
1460
1461         /* Clear memory counter. */
1462         tp->ucopy.memory = 0;
1463 #endif
1464 }
1465
1466 static inline
1467 struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1468 {
1469 #if 0
1470         struct sk_buff *skb;
1471         u32 offset;
1472
1473         skb_queue_walk(&sk->receive_queue, skb) {
1474                 offset = seq - TCP_SKB_CB(skb)->seq;
1475                 if (skb->h.th->syn)
1476                         offset--;
1477                 if (offset < skb->len || skb->h.th->fin) {
1478                         *off = offset;
1479                         return skb;
1480                 }
1481         }
1482         return NULL;
1483 #else
1484         return NULL;
1485 #endif
1486 }
1487
1488 /*
1489  * This routine provides an alternative to tcp_recvmsg() for routines
1490  * that would like to handle copying from skbuffs directly in 'sendfile'
1491  * fashion.
1492  * Note:
1493  *      - It is assumed that the socket was locked by the caller.
1494  *      - The routine does not block.
1495  *      - At present, there is no support for reading OOB data
1496  *        or for 'peeking' the socket using this routine
1497  *        (although both would be easy to implement).
1498  */
1499 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1500                   sk_read_actor_t recv_actor)
1501 {
1502 #if 0
1503         struct sk_buff *skb;
1504         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1505         u32 seq = tp->copied_seq;
1506         u32 offset;
1507         int copied = 0;
1508
1509         if (sk->state == TCP_LISTEN)
1510                 return -ENOTCONN;
1511         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1512                 if (offset < skb->len) {
1513                         size_t used, len;
1514
1515                         len = skb->len - offset;
1516                         /* Stop reading if we hit a patch of urgent data */
1517                         if (tp->urg_data) {
1518                                 u32 urg_offset = tp->urg_seq - seq;
1519                                 if (urg_offset < len)
1520                                         len = urg_offset;
1521                                 if (!len)
1522                                         break;
1523                         }
1524                         used = recv_actor(desc, skb, offset, len);
1525                         if (used <= len) {
1526                                 seq += used;
1527                                 copied += used;
1528                                 offset += used;
1529                         }
1530                         if (offset != skb->len)
1531                                 break;
1532                 }
1533                 if (skb->h.th->fin) {
1534                         tcp_eat_skb(sk, skb);
1535                         ++seq;
1536                         break;
1537                 }
1538                 tcp_eat_skb(sk, skb);
1539                 if (!desc->count)
1540                         break;
1541         }
1542         tp->copied_seq = seq;
1543         /* Clean up data we have read: This will do ACK frames. */
1544         if (copied)
1545                 cleanup_rbuf(sk, copied);
1546         return copied;
1547 #else
1548 #endif
1549 }
1550
1551 /*
1552  *      This routine copies from a sock struct into the user buffer.
1553  *
1554  *      Technical note: in 2.3 we work on _locked_ socket, so that
1555  *      tricks with *seq access order and skb->users are not required.
1556  *      Probably, code can be easily improved even more.
1557  */
1558
1559 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1560                 int len, int nonblock, int flags, int *addr_len)
1561 {
1562 #if 0
1563         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1564         int copied = 0;
1565         u32 peek_seq;
1566         u32 *seq;
1567         unsigned long used;
1568         int err;
1569         int target;             /* Read at least this many bytes */
1570         long timeo;
1571         struct task_struct *user_recv = NULL;
1572
1573         lock_sock(sk);
1574
1575         TCP_CHECK_TIMER(sk);
1576
1577         err = -ENOTCONN;
1578         if (sk->state == TCP_LISTEN)
1579                 goto out;
1580
1581         timeo = sock_rcvtimeo(sk, nonblock);
1582
1583         /* Urgent data needs to be handled specially. */
1584         if (flags & MSG_OOB)
1585                 goto recv_urg;
1586
1587         seq = &tp->copied_seq;
1588         if (flags & MSG_PEEK) {
1589                 peek_seq = tp->copied_seq;
1590                 seq = &peek_seq;
1591         }
1592
1593         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1594
1595         do {
1596                 struct sk_buff * skb;
1597                 u32 offset;
1598
1599                 /* Are we at urgent data? Stop if we have read anything. */
1600                 if (copied && tp->urg_data && tp->urg_seq == *seq)
1601                         break;
1602
1603                 /* We need to check signals first, to get correct SIGURG
1604                  * handling. FIXME: Need to check this doesn't impact 1003.1g
1605                  * and move it down to the bottom of the loop
1606                  */
1607                 if (signal_pending(current)) {
1608                         if (copied)
1609                                 break;
1610                         copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1611                         break;
1612                 }
1613
1614                 /* Next get a buffer. */
1615
1616                 skb = skb_peek(&sk->receive_queue);
1617                 do {
1618                         if (!skb)
1619                                 break;
1620
1621                         /* Now that we have two receive queues this
1622                          * shouldn't happen.
1623                          */
1624                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1625                                 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1626                                        *seq, TCP_SKB_CB(skb)->seq);
1627                                 break;
1628                         }
1629                         offset = *seq - TCP_SKB_CB(skb)->seq;
1630                         if (skb->h.th->syn)
1631                                 offset--;
1632                         if (offset < skb->len)
1633                                 goto found_ok_skb;
1634                         if (skb->h.th->fin)
1635                                 goto found_fin_ok;
1636                         BUG_TRAP(flags&MSG_PEEK);
1637                         skb = skb->next;
1638                 } while (skb != (struct sk_buff *)&sk->receive_queue);
1639
1640                 /* Well, if we have backlog, try to process it now yet. */
1641
1642                 if (copied >= target && sk->backlog.tail == NULL)
1643                         break;
1644
1645                 if (copied) {
1646                         if (sk->err ||
1647                             sk->state == TCP_CLOSE ||
1648                             (sk->shutdown & RCV_SHUTDOWN) ||
1649                             !timeo ||
1650                             (flags & MSG_PEEK))
1651                                 break;
1652                 } else {
1653                         if (sk->done)
1654                                 break;
1655
1656                         if (sk->err) {
1657                                 copied = sock_error(sk);
1658                                 break;
1659                         }
1660
1661                         if (sk->shutdown & RCV_SHUTDOWN)
1662                                 break;
1663
1664                         if (sk->state == TCP_CLOSE) {
1665                                 if (!sk->done) {
1666                                         /* This occurs when user tries to read
1667                                          * from never connected socket.
1668                                          */
1669                                         copied = -ENOTCONN;
1670                                         break;
1671                                 }
1672                                 break;
1673                         }
1674
1675                         if (!timeo) {
1676                                 copied = -EAGAIN;
1677                                 break;
1678                         }
1679                 }
1680
1681                 cleanup_rbuf(sk, copied);
1682
1683                 if (tp->ucopy.task == user_recv) {
1684                         /* Install new reader */
1685                         if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) {
1686                                 user_recv = current;
1687                                 tp->ucopy.task = user_recv;
1688                                 tp->ucopy.iov = msg->msg_iov;
1689                         }
1690
1691                         tp->ucopy.len = len;
1692
1693                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC)));
1694
1695                         /* Ugly... If prequeue is not empty, we have to
1696                          * process it before releasing socket, otherwise
1697                          * order will be broken at second iteration.
1698                          * More elegant solution is required!!!
1699                          *
1700                          * Look: we have the following (pseudo)queues:
1701                          *
1702                          * 1. packets in flight
1703                          * 2. backlog
1704                          * 3. prequeue
1705                          * 4. receive_queue
1706                          *
1707                          * Each queue can be processed only if the next ones
1708                          * are empty. At this point we have empty receive_queue.
1709                          * But prequeue _can_ be not empty after second iteration,
1710                          * when we jumped to start of loop because backlog
1711                          * processing added something to receive_queue.
1712                          * We cannot release_sock(), because backlog contains
1713                          * packets arrived _after_ prequeued ones.
1714                          *
1715                          * Shortly, algorithm is clear --- to process all
1716                          * the queues in order. We could make it more directly,
1717                          * requeueing packets from backlog to prequeue, if
1718                          * is not empty. It is more elegant, but eats cycles,
1719                          * unfortunately.
1720                          */
1721                         if (skb_queue_len(&tp->ucopy.prequeue))
1722                                 goto do_prequeue;
1723
1724                         /* __ Set realtime policy in scheduler __ */
1725                 }
1726
1727                 if (copied >= target) {
1728                         /* Do not sleep, just process backlog. */
1729                         release_sock(sk);
1730                         lock_sock(sk);
1731                 } else {
1732                         timeo = tcp_data_wait(sk, timeo);
1733                 }
1734
1735                 if (user_recv) {
1736                         int chunk;
1737
1738                         /* __ Restore normal policy in scheduler __ */
1739
1740                         if ((chunk = len - tp->ucopy.len) != 0) {
1741                                 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
1742                                 len -= chunk;
1743                                 copied += chunk;
1744                         }
1745
1746                         if (tp->rcv_nxt == tp->copied_seq &&
1747                             skb_queue_len(&tp->ucopy.prequeue)) {
1748 do_prequeue:
1749                                 tcp_prequeue_process(sk);
1750
1751                                 if ((chunk = len - tp->ucopy.len) != 0) {
1752                                         net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1753                                         len -= chunk;
1754                                         copied += chunk;
1755                                 }
1756                         }
1757                 }
1758                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1759                         if (net_ratelimit())
1760                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1761                                        current->comm, current->pid);
1762                         peek_seq = tp->copied_seq;
1763                 }
1764                 continue;
1765
1766         found_ok_skb:
1767                 /* Ok so how much can we use? */
1768                 used = skb->len - offset;
1769                 if (len < used)
1770                         used = len;
1771
1772                 /* Do we have urgent data here? */
1773                 if (tp->urg_data) {
1774                         u32 urg_offset = tp->urg_seq - *seq;
1775                         if (urg_offset < used) {
1776                                 if (!urg_offset) {
1777                                         if (!sk->urginline) {
1778                                                 ++*seq;
1779                                                 offset++;
1780                                                 used--;
1781                                                 if (!used)
1782                                                         goto skip_copy;
1783                                         }
1784                                 } else
1785                                         used = urg_offset;
1786                         }
1787                 }
1788
1789                 if (!(flags&MSG_TRUNC)) {
1790                         err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, used);
1791                         if (err) {
1792                                 /* Exception. Bailout! */
1793                                 if (!copied)
1794                                         copied = -EFAULT;
1795                                 break;
1796                         }
1797                 }
1798
1799                 *seq += used;
1800                 copied += used;
1801                 len -= used;
1802
1803 skip_copy:
1804                 if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
1805                         tp->urg_data = 0;
1806                         tcp_fast_path_check(sk, tp);
1807                 }
1808                 if (used + offset < skb->len)
1809                         continue;
1810
1811                 if (skb->h.th->fin)
1812                         goto found_fin_ok;
1813                 if (!(flags & MSG_PEEK))
1814                         tcp_eat_skb(sk, skb);
1815                 continue;
1816
1817         found_fin_ok:
1818                 /* Process the FIN. */
1819                 ++*seq;
1820                 if (!(flags & MSG_PEEK))
1821                         tcp_eat_skb(sk, skb);
1822                 break;
1823         } while (len > 0);
1824
1825         if (user_recv) {
1826                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1827                         int chunk;
1828
1829                         tp->ucopy.len = copied > 0 ? len : 0;
1830
1831                         tcp_prequeue_process(sk);
1832
1833                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1834                                 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1835                                 len -= chunk;
1836                                 copied += chunk;
1837                         }
1838                 }
1839
1840                 tp->ucopy.task = NULL;
1841                 tp->ucopy.len = 0;
1842         }
1843
1844         /* According to UNIX98, msg_name/msg_namelen are ignored
1845          * on connected socket. I was just happy when found this 8) --ANK
1846          */
1847
1848         /* Clean up data we have read: This will do ACK frames. */
1849         cleanup_rbuf(sk, copied);
1850
1851         TCP_CHECK_TIMER(sk);
1852         release_sock(sk);
1853         return copied;
1854
1855 out:
1856         TCP_CHECK_TIMER(sk);
1857         release_sock(sk);
1858         return err;
1859
1860 recv_urg:
1861         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1862         goto out;
1863 #else
1864         return 0;
1865 #endif
1866 }
1867
1868 /*
1869  *      State processing on a close. This implements the state shift for
1870  *      sending our FIN frame. Note that we only send a FIN for some
1871  *      states. A shutdown() may have already sent the FIN, or we may be
1872  *      closed.
1873  */
1874
1875 static unsigned char new_state[16] = {
1876   /* current state:        new state:      action:      */
1877   /* (Invalid)          */ TCP_CLOSE,
1878   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1879   /* TCP_SYN_SENT       */ TCP_CLOSE,
1880   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1881   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1882   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1883   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1884   /* TCP_CLOSE          */ TCP_CLOSE,
1885   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1886   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1887   /* TCP_LISTEN         */ TCP_CLOSE,
1888   /* TCP_CLOSING        */ TCP_CLOSING,
1889 };
1890
1891 static int tcp_close_state(struct sock *sk)
1892 {
1893 #if 0
1894         int next = (int) new_state[sk->state];
1895         int ns = (next & TCP_STATE_MASK);
1896
1897         tcp_set_state(sk, ns);
1898
1899         return (next & TCP_ACTION_FIN);
1900 #else
1901         return 0;
1902 #endif
1903 }
1904
1905 /*
1906  *      Shutdown the sending side of a connection. Much like close except
1907  *      that we don't receive shut down or set sk->dead.
1908  */
1909
1910 void tcp_shutdown(struct sock *sk, int how)
1911 {
1912 #if 0
1913         /*      We need to grab some memory, and put together a FIN,
1914          *      and then put it into the queue to be sent.
1915          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1916          */
1917         if (!(how & SEND_SHUTDOWN))
1918                 return;
1919
1920         /* If we've already sent a FIN, or it's a closed state, skip this. */
1921         if ((1 << sk->state) &
1922             (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1923                 /* Clear out any half completed packets.  FIN if needed. */
1924                 if (tcp_close_state(sk))
1925                         tcp_send_fin(sk);
1926         }
1927 #endif
1928 }
1929
1930
1931 /*
1932  *      Return 1 if we still have things to send in our buffers.
1933  */
1934
1935 static inline int closing(struct sock * sk)
1936 {
1937 #if 0
1938         return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1939 #else
1940         return 0;
1941 #endif
1942 }
1943
1944 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1945 {
1946 #if 0
1947         /* First the read buffer. */
1948         __skb_queue_purge(&sk->receive_queue);
1949
1950         /* Next, the error queue. */
1951         __skb_queue_purge(&sk->error_queue);
1952
1953         /* Next, the write queue. */
1954         BUG_TRAP(skb_queue_empty(&sk->write_queue));
1955
1956         /* Account for returned memory. */
1957         tcp_mem_reclaim(sk);
1958
1959         BUG_TRAP(sk->wmem_queued == 0);
1960         BUG_TRAP(sk->forward_alloc == 0);
1961
1962         /* It is _impossible_ for the backlog to contain anything
1963          * when we get here.  All user references to this socket
1964          * have gone away, only the net layer knows can touch it.
1965          */
1966 #endif
1967 }
1968
1969 /*
1970  * At this point, there should be no process reference to this
1971  * socket, and thus no user references at all.  Therefore we
1972  * can assume the socket waitqueue is inactive and nobody will
1973  * try to jump onto it.
1974  */
1975 void tcp_destroy_sock(struct sock *sk)
1976 {
1977 #if 0
1978         BUG_TRAP(sk->state==TCP_CLOSE);
1979         BUG_TRAP(sk->dead);
1980
1981         /* It cannot be in hash table! */
1982         BUG_TRAP(sk->pprev==NULL);
1983
1984         /* If it has not 0 sk->num, it must be bound */
1985         BUG_TRAP(!sk->num || sk->prev!=NULL);
1986
1987 #ifdef TCP_DEBUG
1988         if (sk->zapped) {
1989                 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1990                 sock_hold(sk);
1991         }
1992         sk->zapped = 1;
1993 #endif
1994
1995         sk->prot->destroy(sk);
1996
1997         tcp_kill_sk_queues(sk);
1998
1999 #ifdef INET_REFCNT_DEBUG
2000         if (atomic_read(&sk->refcnt) != 1) {
2001                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
2002         }
2003 #endif
2004
2005         atomic_dec(&tcp_orphan_count);
2006         sock_put(sk);
2007 #endif
2008 }
2009
2010 void tcp_close(struct sock *sk, long timeout)
2011 {
2012 #if 0
2013         struct sk_buff *skb;
2014         int data_was_unread = 0;
2015
2016         lock_sock(sk);
2017         sk->shutdown = SHUTDOWN_MASK;
2018
2019         if(sk->state == TCP_LISTEN) {
2020                 tcp_set_state(sk, TCP_CLOSE);
2021
2022                 /* Special case. */
2023                 tcp_listen_stop(sk);
2024
2025                 goto adjudge_to_death;
2026         }
2027
2028         /*  We need to flush the recv. buffs.  We do this only on the
2029          *  descriptor close, not protocol-sourced closes, because the
2030          *  reader process may not have drained the data yet!
2031          */
2032         while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
2033                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
2034                 data_was_unread += len;
2035                 __kfree_skb(skb);
2036         }
2037
2038         tcp_mem_reclaim(sk);
2039
2040         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
2041          * 3.10, we send a RST here because data was lost.  To
2042          * witness the awful effects of the old behavior of always
2043          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
2044          * a bulk GET in an FTP client, suspend the process, wait
2045          * for the client to advertise a zero window, then kill -9
2046          * the FTP client, wheee...  Note: timeout is always zero
2047          * in such a case.
2048          */
2049         if(data_was_unread != 0) {
2050                 /* Unread data was tossed, zap the connection. */
2051                 NET_INC_STATS_USER(TCPAbortOnClose);
2052                 tcp_set_state(sk, TCP_CLOSE);
2053                 tcp_send_active_reset(sk, GFP_KERNEL);
2054         } else if (sk->linger && sk->lingertime==0) {
2055                 /* Check zero linger _after_ checking for unread data. */
2056                 sk->prot->disconnect(sk, 0);
2057                 NET_INC_STATS_USER(TCPAbortOnData);
2058         } else if (tcp_close_state(sk)) {
2059                 /* We FIN if the application ate all the data before
2060                  * zapping the connection.
2061                  */
2062
2063                 /* RED-PEN. Formally speaking, we have broken TCP state
2064                  * machine. State transitions:
2065                  *
2066                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
2067                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
2068                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
2069                  *
2070                  * are legal only when FIN has been sent (i.e. in window),
2071                  * rather than queued out of window. Purists blame.
2072                  *
2073                  * F.e. "RFC state" is ESTABLISHED,
2074                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
2075                  *
2076                  * The visible declinations are that sometimes
2077                  * we enter time-wait state, when it is not required really
2078                  * (harmless), do not send active resets, when they are
2079                  * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
2080                  * they look as CLOSING or LAST_ACK for Linux)
2081                  * Probably, I missed some more holelets.
2082                  *                                              --ANK
2083                  */
2084                 tcp_send_fin(sk);
2085         }
2086
2087         if (timeout) {
2088                 struct task_struct *tsk = current;
2089                 DECLARE_WAITQUEUE(wait, current);
2090
2091                 add_wait_queue(sk->sleep, &wait);
2092
2093                 do {
2094                         set_current_state(TASK_INTERRUPTIBLE);
2095                         if (!closing(sk))
2096                                 break;
2097                         release_sock(sk);
2098                         timeout = schedule_timeout(timeout);
2099                         lock_sock(sk);
2100                 } while (!signal_pending(tsk) && timeout);
2101
2102                 tsk->state = TASK_RUNNING;
2103                 remove_wait_queue(sk->sleep, &wait);
2104         }
2105
2106 adjudge_to_death:
2107         /* It is the last release_sock in its life. It will remove backlog. */
2108         release_sock(sk);
2109
2110
2111         /* Now socket is owned by kernel and we acquire BH lock
2112            to finish close. No need to check for user refs.
2113          */
2114         local_bh_disable();
2115         bh_lock_sock(sk);
2116         BUG_TRAP(sk->lock.users==0);
2117
2118         sock_hold(sk);
2119         sock_orphan(sk);
2120
2121         /*      This is a (useful) BSD violating of the RFC. There is a
2122          *      problem with TCP as specified in that the other end could
2123          *      keep a socket open forever with no application left this end.
2124          *      We use a 3 minute timeout (about the same as BSD) then kill
2125          *      our end. If they send after that then tough - BUT: long enough
2126          *      that we won't make the old 4*rto = almost no time - whoops
2127          *      reset mistake.
2128          *
2129          *      Nope, it was not mistake. It is really desired behaviour
2130          *      f.e. on http servers, when such sockets are useless, but
2131          *      consume significant resources. Let's do it with special
2132          *      linger2 option.                                 --ANK
2133          */
2134
2135         if (sk->state == TCP_FIN_WAIT2) {
2136                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2137                 if (tp->linger2 < 0) {
2138                         tcp_set_state(sk, TCP_CLOSE);
2139                         tcp_send_active_reset(sk, GFP_ATOMIC);
2140                         NET_INC_STATS_BH(TCPAbortOnLinger);
2141                 } else {
2142                         int tmo = tcp_fin_time(tp);
2143
2144                         if (tmo > TCP_TIMEWAIT_LEN) {
2145                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2146                         } else {
2147                                 atomic_inc(&tcp_orphan_count);
2148                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2149                                 goto out;
2150                         }
2151                 }
2152         }
2153         if (sk->state != TCP_CLOSE) {
2154                 tcp_mem_reclaim(sk);
2155                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2156                     (sk->wmem_queued > SOCK_MIN_SNDBUF &&
2157                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2158                         if (net_ratelimit())
2159                                 printk(KERN_INFO "TCP: too many of orphaned sockets\n");
2160                         tcp_set_state(sk, TCP_CLOSE);
2161                         tcp_send_active_reset(sk, GFP_ATOMIC);
2162                         NET_INC_STATS_BH(TCPAbortOnMemory);
2163                 }
2164         }
2165         atomic_inc(&tcp_orphan_count);
2166
2167         if (sk->state == TCP_CLOSE)
2168                 tcp_destroy_sock(sk);
2169         /* Otherwise, socket is reprieved until protocol close. */
2170
2171 out:
2172         bh_unlock_sock(sk);
2173         local_bh_enable();
2174         sock_put(sk);
2175 #endif
2176 }
2177
2178 /* These states need RST on ABORT according to RFC793 */
2179
2180 extern __inline__ int tcp_need_reset(int state)
2181 {
2182 #if 0
2183         return ((1 << state) &
2184                 (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
2185                  TCPF_FIN_WAIT2|TCPF_SYN_RECV));
2186 #else
2187         return 0;
2188 #endif
2189 }
2190
2191 int tcp_disconnect(struct sock *sk, int flags)
2192 {
2193 #if 0
2194         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2195         int old_state;
2196         int err = 0;
2197
2198         old_state = sk->state;
2199         if (old_state != TCP_CLOSE)
2200                 tcp_set_state(sk, TCP_CLOSE);
2201
2202         /* ABORT function of RFC793 */
2203         if (old_state == TCP_LISTEN) {
2204                 tcp_listen_stop(sk);
2205         } else if (tcp_need_reset(old_state) ||
2206                    (tp->snd_nxt != tp->write_seq &&
2207                     (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
2208                 /* The last check adjusts for discrepance of Linux wrt. RFC
2209                  * states
2210                  */
2211                 tcp_send_active_reset(sk, gfp_any());
2212                 sk->err = ECONNRESET;
2213         } else if (old_state == TCP_SYN_SENT)
2214                 sk->err = ECONNRESET;
2215
2216         tcp_clear_xmit_timers(sk);
2217         __skb_queue_purge(&sk->receive_queue);
2218         tcp_writequeue_purge(sk);
2219         __skb_queue_purge(&tp->out_of_order_queue);
2220
2221         sk->dport = 0;
2222
2223         if (!(sk->userlocks&SOCK_BINDADDR_LOCK)) {
2224                 sk->rcv_saddr = 0;
2225                 sk->saddr = 0;
2226 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
2227                 memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
2228                 memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
2229 #endif
2230         }
2231
2232         sk->shutdown = 0;
2233         sk->done = 0;
2234         tp->srtt = 0;
2235         if ((tp->write_seq += tp->max_window+2) == 0)
2236                 tp->write_seq = 1;
2237         tp->backoff = 0;
2238         tp->snd_cwnd = 2;
2239         tp->probes_out = 0;
2240         tp->packets_out = 0;
2241         tp->snd_ssthresh = 0x7fffffff;
2242         tp->snd_cwnd_cnt = 0;
2243         tp->ca_state = TCP_CA_Open;
2244         tcp_clear_retrans(tp);
2245         tcp_delack_init(tp);
2246         tp->send_head = NULL;
2247         tp->saw_tstamp = 0;
2248         tcp_sack_reset(tp);
2249         __sk_dst_reset(sk);
2250
2251         BUG_TRAP(!sk->num || sk->prev);
2252
2253         sk->error_report(sk);
2254         return err;
2255 #else
2256         return 0;
2257 #endif
2258 }
2259
2260 /*
2261  *      Wait for an incoming connection, avoid race
2262  *      conditions. This must be called with the socket locked.
2263  */
2264 static int wait_for_connect(struct sock * sk, long timeo)
2265 {
2266 #if 0
2267         DECLARE_WAITQUEUE(wait, current);
2268         int err;
2269
2270         /*
2271          * True wake-one mechanism for incoming connections: only
2272          * one process gets woken up, not the 'whole herd'.
2273          * Since we do not 'race & poll' for established sockets
2274          * anymore, the common case will execute the loop only once.
2275          *
2276          * Subtle issue: "add_wait_queue_exclusive()" will be added
2277          * after any current non-exclusive waiters, and we know that
2278          * it will always _stay_ after any new non-exclusive waiters
2279          * because all non-exclusive waiters are added at the
2280          * beginning of the wait-queue. As such, it's ok to "drop"
2281          * our exclusiveness temporarily when we get woken up without
2282          * having to remove and re-insert us on the wait queue.
2283          */
2284         add_wait_queue_exclusive(sk->sleep, &wait);
2285         for (;;) {
2286                 current->state = TASK_INTERRUPTIBLE;
2287                 release_sock(sk);
2288                 if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
2289                         timeo = schedule_timeout(timeo);
2290                 lock_sock(sk);
2291                 err = 0;
2292                 if (sk->tp_pinfo.af_tcp.accept_queue)
2293                         break;
2294                 err = -EINVAL;
2295                 if (sk->state != TCP_LISTEN)
2296                         break;
2297                 err = sock_intr_errno(timeo);
2298                 if (signal_pending(current))
2299                         break;
2300                 err = -EAGAIN;
2301                 if (!timeo)
2302                         break;
2303         }
2304         current->state = TASK_RUNNING;
2305         remove_wait_queue(sk->sleep, &wait);
2306         return err;
2307 #else
2308         return 0;
2309 #endif
2310 }
2311
2312 /*
2313  *      This will accept the next outstanding connection.
2314  */
2315
2316 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2317 {
2318 #if 0
2319         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2320         struct open_request *req;
2321         struct sock *newsk;
2322         int error;
2323
2324         lock_sock(sk);
2325
2326         /* We need to make sure that this socket is listening,
2327          * and that it has something pending.
2328          */
2329         error = -EINVAL;
2330         if (sk->state != TCP_LISTEN)
2331                 goto out;
2332
2333         /* Find already established connection */
2334         if (!tp->accept_queue) {
2335                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2336
2337                 /* If this is a non blocking socket don't sleep */
2338                 error = -EAGAIN;
2339                 if (!timeo)
2340                         goto out;
2341
2342                 error = wait_for_connect(sk, timeo);
2343                 if (error)
2344                         goto out;
2345         }
2346
2347         req = tp->accept_queue;
2348         if ((tp->accept_queue = req->dl_next) == NULL)
2349                 tp->accept_queue_tail = NULL;
2350
2351         newsk = req->sk;
2352         tcp_acceptq_removed(sk);
2353         tcp_openreq_fastfree(req);
2354         BUG_TRAP(newsk->state != TCP_SYN_RECV);
2355         release_sock(sk);
2356         return newsk;
2357
2358 out:
2359         release_sock(sk);
2360         *err = error;
2361         return NULL;
2362 #else
2363         return NULL;
2364 #endif
2365 }
2366
2367 /*
2368  *      Socket option code for TCP.
2369  */
2370
2371 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
2372                    int optlen)
2373 {
2374 #if 0
2375         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2376         int val;
2377         int err = 0;
2378
2379         if (level != SOL_TCP)
2380                 return tp->af_specific->setsockopt(sk, level, optname,
2381                                                    optval, optlen);
2382
2383         if(optlen<sizeof(int))
2384                 return -EINVAL;
2385
2386         if (get_user(val, (int *)optval))
2387                 return -EFAULT;
2388
2389         lock_sock(sk);
2390
2391         switch(optname) {
2392         case TCP_MAXSEG:
2393                 /* values greater than interface MTU won't take effect.  however at
2394                  * the point when this call is done we typically don't yet know
2395                  * which interface is going to be used
2396                  */
2397                 if(val < 8 || val > MAX_TCP_WINDOW) {
2398                         err = -EINVAL;
2399                         break;
2400                 }
2401                 tp->user_mss = val;
2402                 break;
2403
2404         case TCP_NODELAY:
2405                 /* You cannot try to use this and TCP_CORK in
2406                  * tandem, so let the user know.
2407                  */
2408                 if (tp->nonagle == 2) {
2409                         err = -EINVAL;
2410                         break;
2411                 }
2412                 tp->nonagle = (val == 0) ? 0 : 1;
2413                 if (val)
2414                         tcp_push_pending_frames(sk, tp);
2415                 break;
2416
2417         case TCP_CORK:
2418                 /* When set indicates to always queue non-full frames.
2419                  * Later the user clears this option and we transmit
2420                  * any pending partial frames in the queue.  This is
2421                  * meant to be used alongside sendfile() to get properly
2422                  * filled frames when the user (for example) must write
2423                  * out headers with a write() call first and then use
2424                  * sendfile to send out the data parts.
2425                  *
2426                  * You cannot try to use TCP_NODELAY and this mechanism
2427                  * at the same time, so let the user know.
2428                  */
2429                 if (tp->nonagle == 1) {
2430                         err = -EINVAL;
2431                         break;
2432                 }
2433                 if (val != 0) {
2434                         tp->nonagle = 2;
2435                 } else {
2436                         tp->nonagle = 0;
2437
2438                         tcp_push_pending_frames(sk, tp);
2439                 }
2440                 break;
2441
2442         case TCP_KEEPIDLE:
2443                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2444                         err = -EINVAL;
2445                 else {
2446                         tp->keepalive_time = val * HZ;
2447                         if (sk->keepopen && !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) {
2448                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2449                                 if (tp->keepalive_time > elapsed)
2450                                         elapsed = tp->keepalive_time - elapsed;
2451                                 else
2452                                         elapsed = 0;
2453                                 tcp_reset_keepalive_timer(sk, elapsed);
2454                         }
2455                 }
2456                 break;
2457         case TCP_KEEPINTVL:
2458                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2459                         err = -EINVAL;
2460                 else
2461                         tp->keepalive_intvl = val * HZ;
2462                 break;
2463         case TCP_KEEPCNT:
2464                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2465                         err = -EINVAL;
2466                 else
2467                         tp->keepalive_probes = val;
2468                 break;
2469         case TCP_SYNCNT:
2470                 if (val < 1 || val > MAX_TCP_SYNCNT)
2471                         err = -EINVAL;
2472                 else
2473                         tp->syn_retries = val;
2474                 break;
2475
2476         case TCP_LINGER2:
2477                 if (val < 0)
2478                         tp->linger2 = -1;
2479                 else if (val > sysctl_tcp_fin_timeout/HZ)
2480                         tp->linger2 = 0;
2481                 else
2482                         tp->linger2 = val*HZ;
2483                 break;
2484
2485         case TCP_DEFER_ACCEPT:
2486                 tp->defer_accept = 0;
2487                 if (val > 0) {
2488                         /* Translate value in seconds to number of retransmits */
2489                         while (tp->defer_accept < 32 && val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
2490                                 tp->defer_accept++;
2491                         tp->defer_accept++;
2492                 }
2493                 break;
2494
2495         case TCP_WINDOW_CLAMP:
2496                 if (val==0) {
2497                         if (sk->state != TCP_CLOSE) {
2498                                 err = -EINVAL;
2499                                 break;
2500                         }
2501                         tp->window_clamp = 0;
2502                 } else {
2503                         tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
2504                                 SOCK_MIN_RCVBUF/2 : val;
2505                 }
2506                 break;
2507
2508         case TCP_QUICKACK:
2509                 if (!val) {
2510                         tp->ack.pingpong = 1;
2511                 } else {
2512                         tp->ack.pingpong = 0;
2513                         if ((1<<sk->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT) &&
2514                             tcp_ack_scheduled(tp)) {
2515                                 tp->ack.pending |= TCP_ACK_PUSHED;
2516                                 cleanup_rbuf(sk, 1);
2517                                 if (!(val & 1))
2518                                         tp->ack.pingpong = 1;
2519                         }
2520                 }
2521                 break;
2522
2523         default:
2524                 err = -ENOPROTOOPT;
2525                 break;
2526         };
2527         release_sock(sk);
2528         return err;
2529 #else
2530         return 0;
2531 #endif
2532 }
2533
2534 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2535                    int *optlen)
2536 {
2537 #if 0
2538         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2539         int val, len;
2540
2541         if(level != SOL_TCP)
2542                 return tp->af_specific->getsockopt(sk, level, optname,
2543                                                    optval, optlen);
2544
2545         if(get_user(len,optlen))
2546                 return -EFAULT;
2547
2548         len = min_t(unsigned int, len, sizeof(int));
2549
2550         if(len < 0)
2551                 return -EINVAL;
2552
2553         switch(optname) {
2554         case TCP_MAXSEG:
2555                 val = tp->mss_cache;
2556                 if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
2557                         val = tp->user_mss;
2558                 break;
2559         case TCP_NODELAY:
2560                 val = (tp->nonagle == 1);
2561                 break;
2562         case TCP_CORK:
2563                 val = (tp->nonagle == 2);
2564                 break;
2565         case TCP_KEEPIDLE:
2566                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
2567                 break;
2568         case TCP_KEEPINTVL:
2569                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
2570                 break;
2571         case TCP_KEEPCNT:
2572                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2573                 break;
2574         case TCP_SYNCNT:
2575                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2576                 break;
2577         case TCP_LINGER2:
2578                 val = tp->linger2;
2579                 if (val >= 0)
2580                         val = (val ? : sysctl_tcp_fin_timeout)/HZ;
2581                 break;
2582         case TCP_DEFER_ACCEPT:
2583                 val = tp->defer_accept == 0 ? 0 : ((TCP_TIMEOUT_INIT/HZ)<<(tp->defer_accept-1));
2584                 break;
2585         case TCP_WINDOW_CLAMP:
2586                 val = tp->window_clamp;
2587                 break;
2588         case TCP_INFO:
2589         {
2590                 struct tcp_info info;
2591                 u32 now = tcp_time_stamp;
2592
2593                 if(get_user(len,optlen))
2594                         return -EFAULT;
2595                 info.tcpi_state = sk->state;
2596                 info.tcpi_ca_state = tp->ca_state;
2597                 info.tcpi_retransmits = tp->retransmits;
2598                 info.tcpi_probes = tp->probes_out;
2599                 info.tcpi_backoff = tp->backoff;
2600                 info.tcpi_options = 0;
2601                 if (tp->tstamp_ok)
2602                         info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2603                 if (tp->sack_ok)
2604                         info.tcpi_options |= TCPI_OPT_SACK;
2605                 if (tp->wscale_ok) {
2606                         info.tcpi_options |= TCPI_OPT_WSCALE;
2607                         info.tcpi_snd_wscale = tp->snd_wscale;
2608                         info.tcpi_rcv_wscale = tp->rcv_wscale;
2609                 } else {
2610                         info.tcpi_snd_wscale = 0;
2611                         info.tcpi_rcv_wscale = 0;
2612                 }
2613                 if (tp->ecn_flags&TCP_ECN_OK)
2614                         info.tcpi_options |= TCPI_OPT_ECN;
2615
2616                 info.tcpi_rto = (1000000*tp->rto)/HZ;
2617                 info.tcpi_ato = (1000000*tp->ack.ato)/HZ;
2618                 info.tcpi_snd_mss = tp->mss_cache;
2619                 info.tcpi_rcv_mss = tp->ack.rcv_mss;
2620
2621                 info.tcpi_unacked = tp->packets_out;
2622                 info.tcpi_sacked = tp->sacked_out;
2623                 info.tcpi_lost = tp->lost_out;
2624                 info.tcpi_retrans = tp->retrans_out;
2625                 info.tcpi_fackets = tp->fackets_out;
2626
2627                 info.tcpi_last_data_sent = ((now - tp->lsndtime)*1000)/HZ;
2628                 info.tcpi_last_ack_sent = 0;
2629                 info.tcpi_last_data_recv = ((now - tp->ack.lrcvtime)*1000)/HZ;
2630                 info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp)*1000)/HZ;
2631
2632                 info.tcpi_pmtu = tp->pmtu_cookie;
2633                 info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2634                 info.tcpi_rtt = ((1000000*tp->srtt)/HZ)>>3;
2635                 info.tcpi_rttvar = ((1000000*tp->mdev)/HZ)>>2;
2636                 info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2637                 info.tcpi_snd_cwnd = tp->snd_cwnd;
2638                 info.tcpi_advmss = tp->advmss;
2639                 info.tcpi_reordering = tp->reordering;
2640
2641                 len = min_t(unsigned int, len, sizeof(info));
2642                 if(put_user(len, optlen))
2643                         return -EFAULT;
2644                 if(copy_to_user(optval, &info,len))
2645                         return -EFAULT;
2646                 return 0;
2647         }
2648         case TCP_QUICKACK:
2649                 val = !tp->ack.pingpong;
2650                 break;
2651         default:
2652                 return -ENOPROTOOPT;
2653         };
2654
2655         if(put_user(len, optlen))
2656                 return -EFAULT;
2657         if(copy_to_user(optval, &val,len))
2658                 return -EFAULT;
2659         return 0;
2660 #else
2661         return 0;
2662 #endif
2663 }
2664
2665
2666 //extern void __skb_cb_too_small_for_tcp(int, int);
2667 //extern void tcpdiag_init(void);
2668
2669 void /* __init */ tcp_init(void)
2670 {
2671 #if 0
2672         struct sk_buff *skb = NULL;
2673         unsigned long goal;
2674         int order, i;
2675
2676         if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2677                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2678                                            sizeof(skb->cb));
2679
2680         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2681                                                    sizeof(struct open_request),
2682                                                0, SLAB_HWCACHE_ALIGN,
2683                                                NULL, NULL);
2684         if(!tcp_openreq_cachep)
2685                 panic("tcp_init: Cannot alloc open_request cache.");
2686
2687         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2688                                               sizeof(struct tcp_bind_bucket),
2689                                               0, SLAB_HWCACHE_ALIGN,
2690                                               NULL, NULL);
2691         if(!tcp_bucket_cachep)
2692                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2693
2694         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2695                                                 sizeof(struct tcp_tw_bucket),
2696                                                 0, SLAB_HWCACHE_ALIGN,
2697                                                 NULL, NULL);
2698         if(!tcp_timewait_cachep)
2699                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2700
2701         /* Size and allocate the main established and bind bucket
2702          * hash tables.
2703          *
2704          * The methodology is similar to that of the buffer cache.
2705          */
2706         if (num_physpages >= (128 * 1024))
2707                 goal = num_physpages >> (21 - PAGE_SHIFT);
2708         else
2709                 goal = num_physpages >> (23 - PAGE_SHIFT);
2710
2711         for(order = 0; (1UL << order) < goal; order++)
2712                 ;
2713         do {
2714                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2715                         sizeof(struct tcp_ehash_bucket);
2716                 tcp_ehash_size >>= 1;
2717                 while (tcp_ehash_size & (tcp_ehash_size-1))
2718                         tcp_ehash_size--;
2719                 tcp_ehash = (struct tcp_ehash_bucket *)
2720                         __get_free_pages(GFP_ATOMIC, order);
2721         } while (tcp_ehash == NULL && --order > 0);
2722
2723         if (!tcp_ehash)
2724                 panic("Failed to allocate TCP established hash table\n");
2725         for (i = 0; i < (tcp_ehash_size<<1); i++) {
2726                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2727                 tcp_ehash[i].chain = NULL;
2728         }
2729
2730         do {
2731                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2732                         sizeof(struct tcp_bind_hashbucket);
2733                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2734                         continue;
2735                 tcp_bhash = (struct tcp_bind_hashbucket *)
2736                         __get_free_pages(GFP_ATOMIC, order);
2737         } while (tcp_bhash == NULL && --order >= 0);
2738
2739         if (!tcp_bhash)
2740                 panic("Failed to allocate TCP bind hash table\n");
2741         for (i = 0; i < tcp_bhash_size; i++) {
2742                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2743                 tcp_bhash[i].chain = NULL;
2744         }
2745
2746         /* Try to be a bit smarter and adjust defaults depending
2747          * on available memory.
2748          */
2749         if (order > 4) {
2750                 sysctl_local_port_range[0] = 32768;
2751                 sysctl_local_port_range[1] = 61000;
2752                 sysctl_tcp_max_tw_buckets = 180000;
2753                 sysctl_tcp_max_orphans = 4096<<(order-4);
2754                 sysctl_max_syn_backlog = 1024;
2755         } else if (order < 3) {
2756                 sysctl_local_port_range[0] = 1024*(3-order);
2757                 sysctl_tcp_max_tw_buckets >>= (3-order);
2758                 sysctl_tcp_max_orphans >>= (3-order);
2759                 sysctl_max_syn_backlog = 128;
2760         }
2761         tcp_port_rover = sysctl_local_port_range[0] - 1;
2762
2763         sysctl_tcp_mem[0] = 768<<order;
2764         sysctl_tcp_mem[1] = 1024<<order;
2765         sysctl_tcp_mem[2] = 1536<<order;
2766         if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 512)
2767                 sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 512;
2768         if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 512)
2769                 sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 512;
2770
2771         if (order < 3) {
2772                 sysctl_tcp_wmem[2] = 64*1024;
2773                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2774                 sysctl_tcp_rmem[1] = 43689;
2775                 sysctl_tcp_rmem[2] = 2*43689;
2776         }
2777
2778         printk(KERN_INFO "TCP: Hash tables configured (established %d bind %d)\n",
2779                tcp_ehash_size<<1, tcp_bhash_size);
2780
2781         tcpdiag_init();
2782 #endif
2783 }