9 files changed, 376 insertions, 97 deletions
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index eeb5b2e97bed..83bf4986baea 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -2230,11 +2230,8 @@ balance-rr: This mode is the only mode that will permit a single
 
 	It is possible to adjust TCP/IP's congestion limits by
 	altering the net.ipv4.tcp_reordering sysctl parameter.  The
-	usual default value is 3, and the maximum useful value is 127.
-	For a four interface balance-rr bond, expect that a single
-	TCP/IP stream will utilize no more than approximately 2.3
-	interface's worth of throughput, even after adjusting
-	tcp_reordering.
+	usual default value is 3. But keep in mind that the TCP stack is able
+	to automatically increase this when it detects reordering.
 
 	Note that the fraction of packets that will be delivered out of
 	order is highly variable, and is unlikely to be zero.  The level
diff --git a/Documentation/networking/fib_trie.txt b/Documentation/networking/fib_trie.txt
index 0723db7f8495..fe719388518b 100644
--- a/Documentation/networking/fib_trie.txt
+++ b/Documentation/networking/fib_trie.txt
@@ -73,8 +73,8 @@ trie_leaf_remove()
 
 trie_rebalance()
 	The key function for the dynamic trie after any change in the trie
-	it is run to optimize and reorganize. Tt will walk the trie upwards 
-	towards the root from a given tnode, doing a resize() at each step 
+	it is run to optimize and reorganize. It will walk the trie upwards
+	towards the root from a given tnode, doing a resize() at each step
 	to implement level compression.
 
 resize()
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 0307e2875f21..9bffdfc648dc 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -56,6 +56,13 @@ ip_forward_use_pmtu - BOOLEAN
 	0 - disabled
 	1 - enabled
 
+fwmark_reflect - BOOLEAN
+	Controls the fwmark of kernel-generated IPv4 reply packets that are not
+	associated with a socket (for example, TCP RSTs or ICMP echo replies).
+	If unset, these packets have a fwmark of zero. If set, they have the
+	fwmark of the packet they are replying to.
+	Default: 0
+
 route/max_size - INTEGER
 	Maximum number of routes allowed in the kernel.  Increase
 	this when using large numbers of interfaces and/or routes.
@@ -376,9 +383,17 @@ tcp_orphan_retries - INTEGER
 	may consume significant resources. Cf. tcp_max_orphans.
 
 tcp_reordering - INTEGER
-	Maximal reordering of packets in a TCP stream.
+	Initial reordering level of packets in a TCP stream.
+	The TCP stack can then dynamically adjust the flow reordering level
+	between this initial value and tcp_max_reordering.
 	Default: 3
 
+tcp_max_reordering - INTEGER
+	Maximal reordering level of packets in a TCP stream.
+	300 is a fairly conservative value, but you might increase it
+	if paths are using per packet load balancing (like bonding rr mode)
+	Default: 300
+
 tcp_retrans_collapse - BOOLEAN
 	Bug-to-bug compatibility with some broken printers.
 	On retransmit try to send bigger packets to work around bugs in
@@ -1201,6 +1216,13 @@ conf/all/forwarding - BOOLEAN
 proxy_ndp - BOOLEAN
 	Do proxy ndp.
 
+fwmark_reflect - BOOLEAN
+	Controls the fwmark of kernel-generated IPv6 reply packets that are not
+	associated with a socket (for example, TCP RSTs or ICMPv6 echo replies).
+	If unset, these packets have a fwmark of zero. If set, they have the
+	fwmark of the packet they are replying to.
+	Default: 0
+
 conf/interface/*:
 	Change special settings per interface.
 
@@ -1452,6 +1474,19 @@ suppress_frag_ndisc - INTEGER
 	1 - (default) discard fragmented neighbor discovery packets
 	0 - allow fragmented neighbor discovery packets
 
+optimistic_dad - BOOLEAN
+	Whether to perform Optimistic Duplicate Address Detection (RFC 4429).
+		0: disabled (default)
+		1: enabled
+
+use_optimistic - BOOLEAN
+	If enabled, do not classify optimistic addresses as deprecated during
+	source address selection.  Preferred addresses will still be chosen
+	before optimistic addresses, subject to other ranking in the source
+	address selection algorithm.
+		0: disabled (default)
+		1: enabled
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt
new file mode 100644
index 000000000000..cf996394e466
--- /dev/null
+++ b/Documentation/networking/ipvlan.txt
@@ -0,0 +1,107 @@
+
+                            IPVLAN Driver HOWTO
+
+Initial Release:
+	Mahesh Bandewar <maheshb AT google.com>
+
+1. Introduction:
+	This is conceptually very similar to the macvlan driver with one major
+exception of using L3 for mux-ing /demux-ing among slaves. This property makes
+the master device share the L2 with its slave devices. I have developed this
+driver in conjunction with network namespaces and am not sure if there is a use
+case outside of it.
+
+
+2. Building and Installation:
+	In order to build the driver, please select the config item CONFIG_IPVLAN.
+The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module
+(CONFIG_IPVLAN=m).
+
+
+3. Configuration:
+	There are no module parameters for this driver and it can be configured
+using IProute2/ip utility.
+
+	ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | l3 }
+
+	e.g. ip link add link eth0 ipvl0 type ipvlan mode l2
+
+
+4. Operating modes:
+	IPvlan has two modes of operation - L2 and L3. For a given master device,
+you can select one of these two modes and all slaves on that master will
+operate in the same (selected) mode. The RX mode is almost identical except
+that in L3 mode the slaves won't receive any multicast / broadcast traffic.
+L3 mode is more restrictive since routing is controlled from the other (mostly)
+default namespace.
+
+4.1 L2 mode:
+	In this mode TX processing happens on the stack instance attached to the
+slave device and packets are switched and queued to the master device to send
+out. In this mode the slaves will RX/TX multicast and broadcast (if applicable)
+as well.
+
+4.2 L3 mode:
+	In this mode TX processing up to L3 happens on the stack instance attached
+to the slave device and packets are switched to the stack instance of the
+master device for the L2 processing and routing from that instance will be
+used before packets are queued on the outbound device. In this mode the slaves
+will not receive nor can send multicast / broadcast traffic.
+
+
+5. What to choose (macvlan vs. ipvlan)?
+	These two devices are very similar in many regards and the specific use
+case could very well define which device to choose. If one of the following
+situations defines your use case then you can choose to use ipvlan -
+	(a) The Linux host that is connected to the external switch / router has
+policy configured that allows only one mac per port.
+	(b) No of virtual devices created on a master exceed the mac capacity and
+puts the NIC in promiscuous mode and degraded performance is a concern.
+	(c) If the slave device is to be put into the hostile / untrusted network
+namespace where L2 on the slave could be changed / misused.
+
+
+6. Example configuration:
+
+  +=============================================================+
+  |  Host: host1                                                |
+  |                                                             |
+  |   +----------------------+      +----------------------+    |
+  |   |   NS:ns0             |      |  NS:ns1              |    |
+  |   |                      |      |                      |    |
+  |   |                      |      |                      |    |
+  |   |        ipvl0         |      |         ipvl1        |    |
+  |   +----------#-----------+      +-----------#----------+    |
+  |              #                              #               |
+  |              ################################               |
+  |                              # eth0                         |
+  +==============================#==============================+
+
+
+	(a) Create two network namespaces - ns0, ns1
+		ip netns add ns0
+		ip netns add ns1
+
+	(b) Create two ipvlan slaves on eth0 (master device)
+		ip link add link eth0 ipvl0 type ipvlan mode l2
+		ip link add link eth0 ipvl1 type ipvlan mode l2
+
+	(c) Assign slaves to the respective network namespaces
+		ip link set dev ipvl0 netns ns0
+		ip link set dev ipvl1 netns ns1
+
+	(d) Now switch to the namespace (ns0 or ns1) to configure the slave devices
+		- For ns0
+			(1) ip netns exec ns0 bash
+			(2) ip link set dev ipvl0 up
+			(3) ip link set dev lo up
+			(4) ip -4 addr add 127.0.0.1 dev lo
+			(5) ip -4 addr add $IPADDR dev ipvl0
+			(6) ip -4 route add default via $ROUTER dev ipvl0
+		- For ns1
+			(1) ip netns exec ns1 bash
+			(2) ip link set dev ipvl1 up
+			(3) ip link set dev lo up
+			(4) ip -4 addr add 127.0.0.1 dev lo
+			(5) ip -4 addr add $IPADDR dev ipvl1
+			(6) ip -4 route add default via $ROUTER dev ipvl1
diff --git a/Documentation/networking/ixgbe.txt b/Documentation/networking/ixgbe.txt
index 96cccebb839b..0ace6e776ac8 100644
--- a/Documentation/networking/ixgbe.txt
+++ b/Documentation/networking/ixgbe.txt
@@ -138,7 +138,7 @@ Other ethtool Commands:
 To enable Flow Director
 	ethtool -K ethX ntuple on
 To add a filter
-	Use -U switch. e.g., ethtool -U ethX flow-type tcp4 src-ip 0x178000a
+	Use -U switch. e.g., ethtool -U ethX flow-type tcp4 src-ip 10.0.128.23
         action 1
 To see the list of filters currently present:
 	ethtool -u ethX
diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index 2090895b08d4..e655e2453c98 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -1,12 +1,12 @@
        STMicroelectronics 10/100/1000 Synopsys Ethernet driver
 
-Copyright (C) 2007-2013  STMicroelectronics Ltd
+Copyright (C) 2007-2014  STMicroelectronics Ltd
 Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
 
 This is the driver for the MAC 10/100/1000 on-chip Ethernet controllers
 (Synopsys IP blocks).
 
-Currently this network device driver is for all STM embedded MAC/GMAC
+Currently this network device driver is for all STi embedded MAC/GMAC
 (i.e. 7xxx/5xxx SoCs), SPEAr (arm), Loongson1B (mips) and XLINX XC2V3000
 FF1152AMT0221 D1215994A VIRTEX FPGA board.
 
@@ -22,6 +22,9 @@ The kernel configuration option is STMMAC_ETH:
  Device Drivers ---> Network device support ---> Ethernet (1000 Mbit) --->
  STMicroelectronics 10/100/1000 Ethernet driver (STMMAC_ETH)
 
+CONFIG_STMMAC_PLATFORM: is to enable the platform driver.
+CONFIG_STMMAC_PCI: is to enable the pci driver.
+
 2) Driver parameters list:
 	debug: message level (0: no output, 16: all);
 	phyaddr: to manually provide the physical address to the PHY device;
@@ -45,10 +48,11 @@ Driver parameters can be also passed in command line by using:
 The xmit method is invoked when the kernel needs to transmit a packet; it sets
 the descriptors in the ring and informs the DMA engine that there is a packet
 ready to be transmitted.
-Once the controller has finished transmitting the packet, an interrupt is
-triggered; So the driver will be able to release the socket buffers.
 By default, the driver sets the NETIF_F_SG bit in the features field of the
-net_device structure enabling the scatter/gather feature.
+net_device structure enabling the scatter-gather feature. This is true on
+chips and configurations where the checksum can be done in hardware.
+Once the controller has finished transmitting the packet, napi will be
+scheduled to release the transmit resources.
 
 4.2) Receive process
 When one or more packets are received, an interrupt happens. The interrupts
@@ -58,20 +62,12 @@ This is based on NAPI so the interrupt handler signals only if there is work
 to be done, and it exits.
 Then the poll method will be scheduled at some future point.
 The incoming packets are stored, by the DMA, in a list of pre-allocated socket
-buffers in order to avoid the memcpy (Zero-copy).
+buffers in order to avoid the memcpy (zero-copy).
 
 4.3) Interrupt Mitigation
 The driver is able to mitigate the number of its DMA interrupts
 using NAPI for the reception on chips older than the 3.50.
 New chips have an HW RX-Watchdog used for this mitigation.
-
-On Tx-side, the mitigation schema is based on a SW timer that calls the
-tx function (stmmac_tx) to reclaim the resource after transmitting the
-frames.
-Also there is another parameter (like a threshold) used to program
-the descriptors avoiding to set the interrupt on completion bit in
-when the frame is sent (xmit).
-
 Mitigation parameters can be tuned by ethtool.
 
 4.4) WOL
@@ -79,7 +75,7 @@ Wake up on Lan feature through Magic and Unicast frames are supported for the
 GMAC core.
 
 4.5) DMA descriptors
-Driver handles both normal and enhanced descriptors. The latter has been only
+Driver handles both normal and alternate descriptors. The latter has been only
 tested on DWC Ether MAC 10/100/1000 Universal version 3.41a and later.
 
 STMMAC supports DMA descriptor to operate both in dual buffer (RING)
@@ -91,9 +87,20 @@ In CHAINED mode each descriptor will have pointer to next descriptor in
 the list, hence creating the explicit chaining in the descriptor itself,
 whereas such explicit chaining is not possible in RING mode.
 
+4.5.1) Extended descriptors
+	The extended descriptors give us information about the Ethernet payload
+	when it is carrying PTP packets or TCP/UDP/ICMP over IP.
+	These are not available on GMAC Synopsys chips older than the 3.50.
+	At probe time the driver will decide if these can be actually used.
+	This support also is mandatory for PTPv2 because the extra descriptors
+	are used for saving the hardware timestamps and Extended Status.
+
 4.6) Ethtool support
-Ethtool is supported. Driver statistics and internal errors can be taken using:
-ethtool -S ethX command. It is possible to dump registers etc.
+Ethtool is supported.
+
+For example, driver statistics (including RMON) and internal errors can be
+obtained using:
+  # ethtool -S ethX command
 
 4.7) Jumbo and Segmentation Offloading
 Jumbo frames are supported and tested for the GMAC.
@@ -101,12 +108,11 @@ The GSO has been also added but it's performed in software.
 LRO is not supported.
 
 4.8) Physical
-The driver is compatible with PAL to work with PHY and GPHY devices.
+The driver is compatible with Physical Abstraction Layer to be connected with
+PHY and GPHY devices.
 
 4.9) Platform information
-Several driver's information can be passed through the platform
-These are included in the include/linux/stmmac.h header file
-and detailed below as well:
+Several pieces of information can be passed through the platform and device-tree.
 
 struct plat_stmmacenet_data {
 	char *phy_bus_name;
@@ -125,15 +131,18 @@ struct plat_stmmacenet_data {
 	int force_sf_dma_mode;
 	int force_thresh_dma_mode;
 	int riwt_off;
+	int max_speed;
+	int maxmtu;
 	void (*fix_mac_speed)(void *priv, unsigned int speed);
 	void (*bus_setup)(void __iomem *ioaddr);
 	void *(*setup)(struct platform_device *pdev);
+	void (*free)(struct platform_device *pdev, void *priv);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
 	void *custom_cfg;
 	void *custom_data;
 	void *bsp_priv;
- };
+};
 
 Where:
  o phy_bus_name: phy bus name to attach to the stmmac.
@@ -258,32 +267,43 @@ and the second one, with a real PHY device attached to the bus,
 by using the stmmac_mdio_bus_data structure (to provide the id, the
 reset procedure etc).
 
-4.10) List of source files:
- o Kconfig
- o Makefile
- o stmmac_main.c: main network device driver;
- o stmmac_mdio.c: mdio functions;
- o stmmac_pci: PCI driver;
- o stmmac_platform.c: platform driver
- o stmmac_ethtool.c: ethtool support;
- o stmmac_timer.[ch]: timer code used for mitigating the driver dma interrupts
-		      (only tested on ST40 platforms based);
+Note that, on newer chips where the HW capability register is available,
+many configurations are discovered at run-time, for example to determine
+whether EEE, HW csum, PTP, enhanced descriptors etc. are actually
+available. The strategy adopted in this driver is that the information from
+the HW capability register can replace what has been passed from the platform.
+
+4.10) Device-tree support.
+
+Please see the following document:
+	Documentation/devicetree/bindings/net/stmmac.txt
+
+and the stmmac_of_data structure inside the include/linux/stmmac.h header file.
+
+4.11) This is a summary of the content of some relevant files:
+ o stmmac_main.c: to implement the main network device driver;
+ o stmmac_mdio.c: to provide mdio functions;
+ o stmmac_pci: this is the PCI driver;
+ o stmmac_platform.c: this is the platform driver (OF supported);
+ o stmmac_ethtool.c: to implement the ethtool support;
  o stmmac.h: private driver structure;
  o common.h: common definitions and VFTs;
  o descs.h: descriptor structure definitions;
- o dwmac1000_core.c: GMAC core functions;
- o dwmac1000_dma.c:  dma functions for the GMAC chip;
- o dwmac1000.h: specific header file for the GMAC;
- o dwmac100_core: MAC 100 core and dma code;
- o dwmac100_dma.c: dma functions for the MAC chip;
+ o dwmac1000_core.c: dwmac GiGa core functions;
+ o dwmac1000_dma.c: dma functions for the GMAC chip;
+ o dwmac1000.h: specific header file for the dwmac GiGa;
+ o dwmac100_core: dwmac 100 core code;
+ o dwmac100_dma.c: dma functions for the dwmac 100 chip;
  o dwmac1000.h: specific header file for the MAC;
- o dwmac_lib.c: generic DMA functions shared among chips;
+ o dwmac_lib.c: generic DMA functions;
  o enh_desc.c: functions for handling enhanced descriptors;
  o norm_desc.c: functions for handling normal descriptors;
  o chain_mode.c/ring_mode.c:: functions to manage RING/CHAINED modes;
  o mmc_core.c/mmc.h: Management MAC Counters;
- o stmmac_hwtstamp.c: HW timestamp support for PTP
- o stmmac_ptp.c: PTP 1588 clock
+ o stmmac_hwtstamp.c: HW timestamp support for PTP;
+ o stmmac_ptp.c: PTP 1588 clock;
+ o dwmac-<XXX>.c: these are for the platform glue-logic file; e.g. dwmac-sti.c
+   for STMicroelectronics SoCs.
 
 5) Debug Information
 
@@ -298,23 +318,14 @@ to get statistics: e.g. using: ethtool -S ethX
 (that shows the Management counters (MMC) if supported)
 or sees the MAC/DMA registers: e.g. using: ethtool -d ethX
 
-Compiling the Kernel with CONFIG_DEBUG_FS and enabling the
-STMMAC_DEBUG_FS option the driver will export the following
+Compiling the Kernel with CONFIG_DEBUG_FS the driver will export the following
 debugfs entries:
 
 /sys/kernel/debug/stmmaceth/descriptors_status
   To show the DMA TX/RX descriptor rings
 
-Developer can also use the "debug" module parameter to get
-further debug information.
-
-In the end, there are other macros (that cannot be enabled
-via menuconfig) to turn-on the RX/TX DMA debugging,
-specific MAC core debug printk etc. Others to enable the
-debug in the TX and RX processes.
-All these are only useful during the developing stage
-and should never enabled inside the code for general usage.
-In fact, these can generate an huge amount of debug messages.
+Developer can also use the "debug" module parameter to get further debug
+information (please see: NETIF Msg Level).
 
 6) Energy Efficient Ethernet
 
@@ -337,15 +348,7 @@ To enter in Tx LPI mode the driver needs to have a software timer
 that enable and disable the LPI mode when there is nothing to be
 transmitted.
 
-7) Extended descriptors
-The extended descriptors give us information about the receive Ethernet payload
-when it is carrying PTP packets or TCP/UDP/ICMP over IP.
-These are not available on GMAC Synopsys chips older than the 3.50.
-At probe time the driver will decide if these can be actually used.
-This support also is mandatory for PTPv2 because the extra descriptors 6 and 7
-are used for saving the hardware timestamps.
-
-8) Precision Time Protocol (PTP)
+7) Precision Time Protocol (PTP)
 The driver supports the IEEE 1588-2002, Precision Time Protocol (PTP),
 which enables precise synchronization of clocks in measurement and
 control systems implemented with technologies such as network
@@ -355,7 +358,7 @@ In addition to the basic timestamp features mentioned in IEEE 1588-2002
 Timestamps, new GMAC cores support the advanced timestamp features.
 IEEE 1588-2008 that can be enabled when configure the Kernel.
 
-9) SGMII/RGMII supports
+8) SGMII/RGMII supports
 New GMAC devices provide own way to manage RGMII/SGMII.
 This information is available at run-time by looking at the
 HW capability register. This means that the stmmac can manage
@@ -364,8 +367,3 @@ In fact, the HW provides a subset of extended registers to
 restart the ANE, verify Full/Half duplex mode and Speed.
 Also thanks to these registers it is possible to look at the
 Auto-negotiated Link Parter Ability.
-
-10) TODO:
- o XGMAC is not supported.
- o Complete the TBI & RTBI support.
- o extend VLAN support for 3.70a SYNP GMAC.
diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt
new file mode 100644
index 000000000000..f981a9295a39
--- /dev/null
+++ b/Documentation/networking/switchdev.txt
@@ -0,0 +1,59 @@
+Switch (and switch-ish) device drivers HOWTO
+===========================
+
+Please note that the word "switch" is used here in a very generic sense.
+This includes devices supporting L2/L3 but also various flow offloading chips,
+including switches embedded into SR-IOV NICs.
+
+Let's describe a topology a bit. Imagine the following example:
+
+       +----------------------------+    +---------------+
+       |     SOME switch chip       |    |      CPU      |
+       +----------------------------+    +---------------+
+       port1 port2 port3 port4 MNGMNT    |     PCI-E     |
+         |     |     |     |     |       +---------------+
+        PHY   PHY    |     |     |         |  NIC0 NIC1
+                     |     |     |         |   |    |
+                     |     |     +- PCI-E -+   |    |
+                     |     +------- MII -------+    |
+                     +------------- MII ------------+
+
+In this example, there are two independent lines between the switch silicon
+and CPU. NIC0 and NIC1 drivers are not aware of a switch presence. They are
+separate from the switch driver. SOME switch chip is managed by a driver
+via PCI-E device MNGMNT. Note that MNGMNT device, NIC0 and NIC1 may be
+connected to some other type of bus.
+
+Now, for the previous example, here is the representation in the kernel:
+
+       +----------------------------+    +---------------+
+       |     SOME switch chip       |    |      CPU      |
+       +----------------------------+    +---------------+
+       sw0p0 sw0p1 sw0p2 sw0p3 MNGMNT    |     PCI-E     |
+         |     |     |     |     |       +---------------+
+        PHY   PHY    |     |     |         |  eth0 eth1
+                     |     |     |         |   |    |
+                     |     |     +- PCI-E -+   |    |
+                     |     +------- MII -------+    |
+                     +------------- MII ------------+
+
+Let's call the example switch driver for SOME switch chip "SOMEswitch". This
+driver takes care of PCI-E device MNGMNT. There is a netdevice instance sw0pX
+created for each port of a switch. These netdevices are instances
+of "SOMEswitch" driver. sw0pX netdevices serve as a "representation"
+of the switch chip. eth0 and eth1 are instances of some other existing driver.
+
+The only difference of the switch-port netdevice from the ordinary netdevice
+is that it implements a couple more NDOs:
+
+  ndo_switch_parent_id_get - This returns the same ID for two port netdevices
+			     of the same physical switch chip. This is
+			     mandatory to be implemented by all switch drivers
+			     and serves the caller for recognition of a port
+			     netdevice.
+  ndo_switch_parent_* - Functions that serve for a manipulation of the switch
+			chip itself (it can be thought of as a "parent" of the
+			port, therefore the name). They are not port-specific.
+			Caller might use arbitrary port netdevice of the same
+			switch and it will make no difference.
+  ndo_switch_port_* - Functions that serve for a port-specific manipulation.
diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt
index 412f45ca2d73..a5c784c89312 100644
--- a/Documentation/networking/timestamping.txt
+++ b/Documentation/networking/timestamping.txt
@@ -122,7 +122,7 @@ SOF_TIMESTAMPING_RAW_HARDWARE:
 
 1.3.3 Timestamp Options
 
-The interface supports one option
+The interface supports the options
 
 SOF_TIMESTAMPING_OPT_ID:
 
@@ -130,19 +130,36 @@ SOF_TIMESTAMPING_OPT_ID:
   have multiple concurrent timestamping requests outstanding. Packets
   can be reordered in the transmit path, for instance in the packet
   scheduler. In that case timestamps will be queued onto the error
-  queue out of order from the original send() calls. This option
-  embeds a counter that is incremented at send() time, to order
-  timestamps within a flow.
+  queue out of order from the original send() calls. It is not always
+  possible to uniquely match timestamps to the original send() calls
+  based on timestamp order or payload inspection alone, then.
+
+  This option associates each packet at send() with a unique
+  identifier and returns that along with the timestamp. The identifier
+  is derived from a per-socket u32 counter (that wraps). For datagram
+  sockets, the counter increments with each sent packet. For stream
+  sockets, it increments with every byte.
+
+  The counter starts at zero. It is initialized the first time that
+  the socket option is enabled. It is reset each time the option is
+  enabled after having been disabled. Resetting the counter does not
+  change the identifiers of existing packets in the system.
 
   This option is implemented only for transmit timestamps. There, the
   timestamp is always looped along with a struct sock_extended_err.
-  The option modifies field ee_info to pass an id that is unique
+  The option modifies field ee_data to pass an id that is unique
   among all possibly concurrently outstanding timestamp requests for
-  that socket. In practice, it is a monotonically increasing u32
-  (that wraps).
+  that socket.
+
+
+SOF_TIMESTAMPING_OPT_CMSG:
 
-  In datagram sockets, the counter increments on each send call. In
-  stream sockets, it increments with every byte.
+  Support recv() cmsg for all timestamped packets. Control messages
+  are already supported unconditionally on all packets with receive
+  timestamps and on IPv6 packets with transmit timestamp. This option
+  extends them to IPv4 packets with transmit timestamp. One use case
+  is to correlate packets with their egress device, by enabling socket
+  option IP_PKTINFO simultaneously.
 
 
 1.4 Bytestream Timestamps
diff --git a/Documentation/networking/timestamping/txtimestamp.c b/Documentation/networking/timestamping/txtimestamp.c
index b32fc2a07734..876f71c5625a 100644
--- a/Documentation/networking/timestamping/txtimestamp.c
+++ b/Documentation/networking/timestamping/txtimestamp.c
@@ -46,6 +46,7 @@
 #include <netpacket/packet.h>
 #include <poll.h>
 #include <stdarg.h>
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -58,6 +59,14 @@
 #include <time.h>
 #include <unistd.h>
 
+/* ugly hack to work around netinet/in.h and linux/ipv6.h conflicts */
+#ifndef in6_pktinfo
+struct in6_pktinfo {
+	struct in6_addr	ipi6_addr;
+	int		ipi6_ifindex;
+};
+#endif
+
 /* command line parameters */
 static int cfg_proto = SOCK_STREAM;
 static int cfg_ipproto = IPPROTO_TCP;
@@ -65,6 +74,8 @@ static int cfg_num_pkts = 4;
 static int do_ipv4 = 1;
 static int do_ipv6 = 1;
 static int cfg_payload_len = 10;
+static bool cfg_show_payload;
+static bool cfg_do_pktinfo;
 static uint16_t dest_port = 9000;
 
 static struct sockaddr_in daddr;
@@ -131,6 +142,30 @@ static void print_timestamp(struct scm_timestamping *tss, int tstype,
 	__print_timestamp(tsname, &tss->ts[0], tskey, payload_len);
 }
 
+/* TODO: convert to check_and_print payload once API is stable */
+static void print_payload(char *data, int len)
+{
+	int i;
+
+	if (len > 70)
+		len = 70;
+
+	fprintf(stderr, "payload: ");
+	for (i = 0; i < len; i++)
+		fprintf(stderr, "%02hhx ", data[i]);
+	fprintf(stderr, "\n");
+}
+
+static void print_pktinfo(int family, int ifindex, void *saddr, void *daddr)
+{
+	char sa[INET6_ADDRSTRLEN], da[INET6_ADDRSTRLEN];
+
+	fprintf(stderr, "         pktinfo: ifindex=%u src=%s dst=%s\n",
+		ifindex,
+		saddr ? inet_ntop(family, saddr, sa, sizeof(sa)) : "unknown",
+		daddr ? inet_ntop(family, daddr, da, sizeof(da)) : "unknown");
+}
+
 static void __poll(int fd)
 {
 	struct pollfd pollfd;
@@ -156,10 +191,9 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len)
 		    cm->cmsg_type == SCM_TIMESTAMPING) {
 			tss = (void *) CMSG_DATA(cm);
 		} else if ((cm->cmsg_level == SOL_IP &&
-		     cm->cmsg_type == IP_RECVERR) ||
-		    (cm->cmsg_level == SOL_IPV6 &&
-		     cm->cmsg_type == IPV6_RECVERR)) {
-
+			    cm->cmsg_type == IP_RECVERR) ||
+			   (cm->cmsg_level == SOL_IPV6 &&
+			    cm->cmsg_type == IPV6_RECVERR)) {
 			serr = (void *) CMSG_DATA(cm);
 			if (serr->ee_errno != ENOMSG ||
 			    serr->ee_origin != SO_EE_ORIGIN_TIMESTAMPING) {
@@ -168,6 +202,16 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len)
 						serr->ee_origin);
 				serr = NULL;
 			}
+		} else if (cm->cmsg_level == SOL_IP &&
+			   cm->cmsg_type == IP_PKTINFO) {
+			struct in_pktinfo *info = (void *) CMSG_DATA(cm);
+			print_pktinfo(AF_INET, info->ipi_ifindex,
+				      &info->ipi_spec_dst, &info->ipi_addr);
+		} else if (cm->cmsg_level == SOL_IPV6 &&
+			   cm->cmsg_type == IPV6_PKTINFO) {
+			struct in6_pktinfo *info6 = (void *) CMSG_DATA(cm);
+			print_pktinfo(AF_INET6, info6->ipi6_ifindex,
+				      NULL, &info6->ipi6_addr);
 		} else
 			fprintf(stderr, "unknown cmsg %d,%d\n",
 					cm->cmsg_level, cm->cmsg_type);
@@ -206,7 +250,11 @@ static int recv_errmsg(int fd)
 	if (ret == -1 && errno != EAGAIN)
 		error(1, errno, "recvmsg");
 
-	__recv_errmsg_cmsg(&msg, ret);
+	if (ret > 0) {
+		__recv_errmsg_cmsg(&msg, ret);
+		if (cfg_show_payload)
+			print_payload(data, cfg_payload_len);
+	}
 
 	free(data);
 	return ret == -1;
@@ -215,9 +263,9 @@ static int recv_errmsg(int fd)
 static void do_test(int family, unsigned int opt)
 {
 	char *buf;
-	int fd, i, val, total_len;
+	int fd, i, val = 1, total_len;
 
-	if (family == IPPROTO_IPV6 && cfg_proto != SOCK_STREAM) {
+	if (family == AF_INET6 && cfg_proto != SOCK_STREAM) {
 		/* due to lack of checksum generation code */
 		fprintf(stderr, "test: skipping datagram over IPv6\n");
 		return;
@@ -239,7 +287,6 @@ static void do_test(int family, unsigned int opt)
 		error(1, errno, "socket");
 
 	if (cfg_proto == SOCK_STREAM) {
-		val = 1;
 		if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
 			       (char*) &val, sizeof(val)))
 			error(1, 0, "setsockopt no nagle");
@@ -253,7 +300,20 @@ static void do_test(int family, unsigned int opt)
 		}
 	}
 
+	if (cfg_do_pktinfo) {
+		if (family == AF_INET6) {
+			if (setsockopt(fd, SOL_IPV6, IPV6_RECVPKTINFO,
+				       &val, sizeof(val)))
+				error(1, errno, "setsockopt pktinfo ipv6");
+		} else {
+			if (setsockopt(fd, SOL_IP, IP_PKTINFO,
+				       &val, sizeof(val)))
+				error(1, errno, "setsockopt pktinfo ipv4");
+		}
+	}
+
 	opt |= SOF_TIMESTAMPING_SOFTWARE |
+	       SOF_TIMESTAMPING_OPT_CMSG |
 	       SOF_TIMESTAMPING_OPT_ID;
 	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
 		       (char *) &opt, sizeof(opt)))
@@ -262,8 +322,6 @@ static void do_test(int family, unsigned int opt)
 	for (i = 0; i < cfg_num_pkts; i++) {
 		memset(&ts_prev, 0, sizeof(ts_prev));
 		memset(buf, 'a' + i, total_len);
-		buf[total_len - 2] = '\n';
-		buf[total_len - 1] = '\0';
 
 		if (cfg_proto == SOCK_RAW) {
 			struct udphdr *udph;
@@ -324,11 +382,13 @@ static void __attribute__((noreturn)) usage(const char *filepath)
 			"  -4:   only IPv4\n"
 			"  -6:   only IPv6\n"
 			"  -h:   show this message\n"
+			"  -I:   request PKTINFO\n"
 			"  -l N: send N bytes at a time\n"
 			"  -r:   use raw\n"
 			"  -R:   use raw (IP_HDRINCL)\n"
 			"  -p N: connect to port N\n"
-			"  -u:   use udp\n",
+			"  -u:   use udp\n"
+			"  -x:   show payload (up to 70 bytes)\n",
 			filepath);
 	exit(1);
 }
@@ -338,7 +398,7 @@ static void parse_opt(int argc, char **argv)
 	int proto_count = 0;
 	char c;
 
-	while ((c = getopt(argc, argv, "46hl:p:rRu")) != -1) {
+	while ((c = getopt(argc, argv, "46hIl:p:rRux")) != -1) {
 		switch (c) {
 		case '4':
 			do_ipv6 = 0;
@@ -346,6 +406,9 @@ static void parse_opt(int argc, char **argv)
 		case '6':
 			do_ipv4 = 0;
 			break;
+		case 'I':
+			cfg_do_pktinfo = true;
+			break;
 		case 'r':
 			proto_count++;
 			cfg_proto = SOCK_RAW;
@@ -367,6 +430,9 @@ static void parse_opt(int argc, char **argv)
 		case 'p':
 			dest_port = strtoul(optarg, NULL, 10);
 			break;
+		case 'x':
+			cfg_show_payload = true;
+			break;
 		case 'h':
 		default:
 			usage(argv[0]);