Tuesday, June 25, 2013

Large flow detection script

Large flow detection describes how sFlow monitoring scales to rapidly detect large flows (flows consuming more than 10% of a link's bandwidth). The chart displays the test pattern developed in the article and shows visually how sFlow can be used to detect and track large flows

This article develops a script for automatically detecting large flows that can form the basis of a load balancing, performance aware software defined networking (SDN) application. The script is based on the node.js DDoS mitigation example presented in Controlling large flows with OpenFlow. Here the script is adapted to poll sFlow-RT every second for link loads, as well as to receive large flow events.
var fs = require('fs');
var http = require('http');

// Address of the sFlow-RT REST API; used as the base options for every request.
var rt = { hostname: 'localhost', port: 8008 };

// Comma-separated list of keys used by the detailed flow definition.
var flowkeys = ['ipsource', 'ipdestination', 'udpsourceport', 'udpdestinationport'].join(',');

// Large flow threshold in bytes per second: 1 Mbit/s = 10% of link bandwidth.
var threshold = 1000000 / 8;

// Link records keyed by "agent>ifindex".
var links = {};

// mininet mapping between sFlow ifIndex numbers and switch/port names.
// Every virtual network device whose name contains a '-' (e.g. "s1-eth1")
// is recorded under its ifindex as {switch: <text before the last '-'>,
// port: <full device name>}.
var ifindexToPort = {};
var path = '/sys/devices/virtual/net/';
var devs = fs.readdirSync(path);
for(var i = 0; i < devs.length; i++) {
 var dev = devs[i];
 // devices without a '-' in their name are not treated as switch ports
 var parts = dev.match(/(.*)-(.*)/);
 if(!parts) continue;

 // read the file as text and parse it as decimal explicitly, rather than
 // relying on Buffer-to-string coercion and parseInt's radix guessing
 var ifindex = parseInt(fs.readFileSync(path + dev + '/ifindex', 'utf8'), 10);
 // an unparsable file would otherwise create a bogus "NaN" entry
 if(isNaN(ifindex)) continue;
 ifindex = ifindex.toString();

 var port = {'switch':parts[1],'port':dev};
 ifindexToPort[ifindex] = port;
}

/**
 * Copy the own enumerable properties of `source` onto `destination`.
 * Properties already present on `destination` are overwritten; inherited
 * properties of `source` are ignored.
 *
 * @param {Object} destination object to modify
 * @param {Object} source      object whose own properties are copied
 * @returns {Object} the same `destination` object
 */
function extend(destination, source) {
 // Borrow hasOwnProperty from Object.prototype so that sources created with
 // Object.create(null), or sources that carry their own "hasOwnProperty"
 // key, do not make the check throw.
 var hasOwn = Object.prototype.hasOwnProperty;
 for (var property in source) {
  if (hasOwn.call(source, property)) {
   destination[property] = source[property];
  }
 }
 return destination;
}

/**
 * Issue an HTTP GET request and pass the JSON-decoded response body to
 * `callback`.
 *
 * NOTE: no 'error' listener is attached to the request and JSON.parse is not
 * guarded, so connection failures and malformed bodies surface as exceptions.
 *
 * @param {Object}   target   http.request options for the server; these are
 *                            merged over the {method, path} defaults
 * @param {string}   path     request path, including any query string
 * @param {Function} callback called with the parsed body when the response ends
 */
function jsonGet(target,path,callback) {
 var options = extend({method:'GET',path:path},target);
 var req = http.request(options,function(resp) {
  var chunks = [];
  resp.on('data', function(chunk) { chunks.push(chunk); });
  resp.on('end', function() {
   // Join the raw buffers before decoding: decoding chunk by chunk corrupts
   // a multi-byte UTF-8 character that straddles a chunk boundary.
   callback(JSON.parse(Buffer.concat(chunks).toString('utf8')));
  });
 });
 req.end();
}

/**
 * Issue an HTTP PUT request with `value` serialized as the JSON request body
 * and pass the raw response body text to `callback`.
 *
 * NOTE: no 'error' listener is attached to the request, so connection
 * failures surface as exceptions.
 *
 * @param {Object}   target   http.request options for the server; these are
 *                            merged over the {method, headers, path} defaults
 * @param {string}   path     request path
 * @param {*}        value    value sent as JSON.stringify(value)
 * @param {Function} callback called with the response body string when the
 *                            response ends
 */
function jsonPut(target,path,value,callback) {
 var options = extend({method:'PUT',headers:{'content-type':'application/json'},path:path},target);
 var req = http.request(options,function(resp) {
  var chunks = [];
  resp.on('data', function(chunk) { chunks.push(chunk); });
  resp.on('end', function() {
   // Join the raw buffers before decoding: decoding chunk by chunk corrupts
   // a multi-byte UTF-8 character that straddles a chunk boundary.
   callback(Buffer.concat(chunks).toString('utf8'));
  });
 });
 req.write(JSON.stringify(value));
 req.end();
}

/**
 * Look up the record for a link, creating and caching it on first use.
 * Records are stored in `links` under the key "agent>ifindex"; a new record
 * carries the agent, the ifindex and the matching `ifindexToPort` entry
 * (undefined when the ifindex has no mapping).
 *
 * @param {string} agent   address of the reporting agent
 * @param {string|number} ifindex interface index on that agent
 * @returns {Object} the cached link record
 */
function getLinkRecord(agent,ifindex) {
 var key = agent + ">" + ifindex;
 var existing = links[key];
 if(existing) return existing;

 var created = {agent:agent, ifindex:ifindex, port:ifindexToPort[ifindex]};
 links[key] = created;
 return created;
}

/**
 * Store the latest metric value on each link record.
 * For every entry, the record identified by (agent, dsIndex) gets its
 * `total` field set to the entry's metricValue.
 *
 * @param {Array} metrics entries carrying agent, dsIndex and metricValue
 */
function updateLinkLoads(metrics) {
 for(var idx = 0; idx < metrics.length; idx++) {
  var sample = metrics[idx];
  getLinkRecord(sample.agent, sample.dsIndex).total = sample.metricValue;
 }
}

/**
 * Report a large flow by printing "<now>  <dt> <port> <flowKey>".
 *
 * @param {Object} link    link record (agent, ifindex, optional port mapping)
 * @param {string} flowKey key of the flow that crossed the threshold
 * @param {number} now     time the event was processed (ms since epoch)
 * @param {number} dt      difference between `now` and the event timestamp (ms)
 */
function largeFlow(link,flowKey,now,dt) {
 // A link whose ifindex is missing from the port map has no `port` entry.
 // Fall back to "agent>ifindex" so the event is still reported rather than
 // throwing a TypeError.
 var portName = link.port ? link.port.port : link.agent + ">" + link.ifindex;
 console.log(now + " " + " " + dt + " " + portName + " " + flowKey);
}

/**
 * Poll for threshold events with IDs after `id` and report the recent ones.
 *
 * Each response is reversed before processing (presumably it lists the newest
 * event first - confirm against the sFlow-RT API). Events for the 'detail'
 * threshold whose timestamp is within 5 seconds of the local clock are passed
 * to largeFlow(). The function then calls itself again, resuming from the
 * eventID of the first entry in the response, or from `id` when the response
 * was empty.
 *
 * @param {number} id eventID to resume from (-1 on the first call)
 */
function getEvents(id) {
 var query = '/events/json?maxEvents=10&timeout=60&eventID='+ id;
 jsonGet(rt, query, function onEvents(batch) {
  var resumeFrom = id;
  if(batch.length > 0) {
   resumeFrom = batch[0].eventID;
   batch.reverse();
   var receivedAt = (new Date()).getTime();
   for(var pos = 0; pos < batch.length; pos++) {
    var entry = batch[pos];
    var age = receivedAt - entry.timestamp;
    // only 'detail' threshold events are of interest
    if('detail' != entry.thresholdID) continue;
    // skip events whose timestamp is not within 5 seconds of now
    if(!(Math.abs(age) < 5000)) continue;
    var key = entry.flowKey;
    var link = getLinkRecord(entry.agent,entry.dataSource);
    largeFlow(link,key,receivedAt,age);
   }
  }
  getEvents(resumeFrom);
 });
}

/**
 * Start monitoring: begin the event polling loop from event ID -1 and
 * refresh the link loads from '/dump/ALL/total/json' once a second.
 */
function startMonitor() {
 getEvents(-1);

 function pollLinkLoads() {
  jsonGet(rt,'/dump/ALL/total/json', updateLinkLoads);
 }
 setInterval(pollLinkLoads, 1000);
}

/**
 * Define the 'total' flow (bytes, filtered to ingress traffic that is not
 * discarded) and then move on to defining the detailed flows.
 */
function setTotalFlows() {
 var definition = {value:'bytes',filter:'outputifindex!=discard&direction=ingress', t:2};
 jsonPut(rt,'/flow/total/json',definition,function onDefined() {
  setDetailedFlows();
 });
}

/**
 * Define the 'detail' flow, keyed by `flowkeys` and measured in bytes with
 * the same ingress filter as the total flow, and then set the threshold.
 */
function setDetailedFlows() {
 var definition = {
  keys:flowkeys,
  value:'bytes',
  filter:'outputifindex!=discard&direction=ingress',
  n:10,
  t:2
 };
 jsonPut(rt,'/flow/detail/json',definition,function onDefined() {
  setThreshold();
 });
}

/**
 * Register the 'detail' threshold: a per-flow threshold on the 'detail'
 * metric at the configured `threshold` value. Monitoring starts once the
 * request completes.
 */
function setThreshold() {
 var definition = {
  metric:'detail',
  value: threshold,
  byFlow: true
 };
 jsonPut(rt,'/threshold/detail/json',definition,function onDefined() {
  startMonitor();
 });
}

/**
 * Entry point of the script. Starts the setup chain by defining the total
 * flow; each later step is triggered from the previous step's completion
 * callback.
 */
function initialize() {
 setTotalFlows();
}

// run the setup as soon as the script is loaded
initialize();
Some notes on the script:
  • The script tracks total bytes per second on each link using packet sampling since this gives a low latency measurement of link utilization consistent with the flow measurements, see Measurement delay, counters vs. packet counters. The link load values are stored in the links hashmap so they are available when deciding if and how to re-route large flows.
  • The flow keys used in this example are ipsource,ipdestination,udpsourceport,udpdestinationport in order to detect the flows generated by iperf. However, any keys can be used and multiple flow definitions could be monitored concurrently.
  • The threshold is set so that any flow that consumes 10% or more of  link bandwidth is reported.
  • The function largeFlow() simply prints out the large flows. However, given a network topology and the link utilization data, this function could be used to implement flow steering controls.
The stepped test pattern described in Large flow detection was used to evaluate the responsiveness of the detection script. The test pattern consists of 20 second constant rate traffic flows ranging from 1Mbit/s to 10Mbit/s, representing 10% to 100% of link bandwidth. The test pattern script was modified to print out the time when each flow is started so that these times could be compared with the times reported by the flow detection script:
# bash sweep2.bash 
1372176102617
1372176132691
1372176162765
1372176192831
1372176222896
1372176252949
1372176283003
1372176313065
1372176343138
1372176373194
The results from the flow detection script are as follows:
$ nodejs lf.js
1372176107535  2 s3-eth1 10.0.0.1,10.0.0.3,57576,5001
1372176108583  5 s2-eth3 10.0.0.1,10.0.0.3,57576,5001
1372176109488  4 s1-eth1 10.0.0.1,10.0.0.3,57576,5001
1372176134246  4 s3-eth1 10.0.0.1,10.0.0.3,48214,5001
1372176134852  4 s2-eth3 10.0.0.1,10.0.0.3,48214,5001
1372176134974  4 s1-eth1 10.0.0.1,10.0.0.3,48214,5001
1372176163792  4 s2-eth3 10.0.0.1,10.0.0.3,39956,5001
1372176164018  5 s1-eth1 10.0.0.1,10.0.0.3,39956,5001
1372176164369  4 s3-eth1 10.0.0.1,10.0.0.3,39956,5001
1372176193584  2 s3-eth1 10.0.0.1,10.0.0.3,38970,5001
1372176193631  6 s2-eth3 10.0.0.1,10.0.0.3,38970,5001
1372176193980  5 s1-eth1 10.0.0.1,10.0.0.3,38970,5001
1372176223492  4 s3-eth1 10.0.0.1,10.0.0.3,44499,5001
1372176223517  3 s2-eth3 10.0.0.1,10.0.0.3,44499,5001
1372176223595  5 s1-eth1 10.0.0.1,10.0.0.3,44499,5001
1372176253274  4 s2-eth3 10.0.0.1,10.0.0.3,39900,5001
1372176253437  3 s3-eth1 10.0.0.1,10.0.0.3,39900,5001
1372176253557  5 s1-eth1 10.0.0.1,10.0.0.3,39900,5001
1372176283485  4 s2-eth3 10.0.0.1,10.0.0.3,55620,5001
1372176283496  9 s1-eth1 10.0.0.1,10.0.0.3,55620,5001
1372176283515  2 s3-eth1 10.0.0.1,10.0.0.3,55620,5001
1372176313469  3 s3-eth1 10.0.0.1,10.0.0.3,58151,5001
1372176313509  2 s1-eth1 10.0.0.1,10.0.0.3,58151,5001
1372176313556  3 s2-eth3 10.0.0.1,10.0.0.3,58151,5001
1372176343398  6 s1-eth1 10.0.0.1,10.0.0.3,41406,5001
1372176343490  4 s2-eth3 10.0.0.1,10.0.0.3,41406,5001
1372176343589  3 s3-eth1 10.0.0.1,10.0.0.3,41406,5001
1372176373525  5 s2-eth3 10.0.0.1,10.0.0.3,44200,5001
1372176373542  5 s3-eth1 10.0.0.1,10.0.0.3,44200,5001
1372176373650  6 s1-eth1 10.0.0.1,10.0.0.3,44200,5001
Note that each flow is detected on ingress by all of the links it traverses as it crosses the network (the blue path shown in the figure below).

Detecting each flow on all the links it traversed gives the controller end to end visibility and allows it to choose globally optimal paths. In this experiment, having multiple independent agents report on the flows provides useful data on the spread of detection times for each flow.

The following table summarizes Detection Time vs. Flow Size from this experiment:

Flow Size (% of link bandwidth) | Detection Time
10% | 4.918 - 6.871 seconds
20% | 1.555 - 2.283 seconds
30% | 1.027 - 1.604 seconds
40% | 0.753 - 1.149 seconds
50% | 0.596 - 0.699 seconds
60% | 0.325 - 0.608 seconds
70% | 0.482 - 0.512 seconds
80% | 0.404 - 0.491 seconds
90% | 0.260 - 0.451 seconds
100% | 0.331 - 0.456 seconds

The relatively slow time to detect the 10% flow results because the threshold was set at 10% and so these flows are on the margin. If a lower threshold had been set, they would have been detected more quickly. For flow sizes larger than 10%, the detection times are between 1 and 2 seconds for flows in the range of 20% - 40% of bandwidth, and detection times for larger flows are consistently sub-second.

The detection times shown in the table are achievable with the following sampling rates, see Large flow detection:

Link Speed | Large Flow | Sampling Rate | Polling Interval
10 Mbit/s | >= 1 Mbit/s | 1-in-10 | 20 seconds
100 Mbit/s | >= 10 Mbit/s | 1-in-100 | 20 seconds
1 Gbit/s | >= 100 Mbit/s | 1-in-1,000 | 20 seconds
10 Gbit/s | >= 1 Gbit/s | 1-in-10,000 | 20 seconds
40 Gbit/s | >= 4 Gbit/s | 1-in-40,000 | 20 seconds
100 Gbit/s | >= 10 Gbit/s | 1-in-100,000 | 20 seconds

These sampling rates allow a central controller to monitor very large scale switch fabrics. In addition, multiple control functions can be applied in parallel based on the sFlow data feed, see Software defined analytics. For example, implementing load balancing, mitigating denial of service attacks and capturing suspicious traffic as SDN applications.

9 comments:

  1. Hi, how do you calculate/generate the detection time for each flow size? Thanks

    ReplyDelete
    Replies
    1. The load generation script prints out a timestamp each time a flow is initiated and the flow detection script prints out a timestamp when a large flow threshold event is generated for each link across the network (3 events). The range of results is the spread of times for the first and last link to detect the large flow (obtained by subtracting the load generation timestamp from the event timestamps).

      Delete
  2. This comment has been removed by the author.

    ReplyDelete
    Replies
    1. This comment has been removed by the author.

      Delete
  3. when I run this script, error is " require is not defined",, How can I fix it?

    ReplyDelete
    Replies
    1. The script in this article is run using node.js to access the sFlow-RT REST API. The error you described would be generated if you were trying to run the script using sFlow-RT's internal JavaScript engine.

      Delete
  4. can i get any tutorial to learn this code..
    thnk you.

    ReplyDelete
    Replies
    1. For node.js there are a number of suggestions on StackOverflow How do I get started with Node.js.

      For sFlow-RT, see Writing Applications

      Delete