sFlow: Alcatel-Lucent OmniSwitch analytics driven control

There are many articles on this blog that demonstrate real-time sFlow analytics driven control of switches using a Mininet testbed. This article is the first of a series that will shift the focus to physical switches and demonstrate different techniques for adapting network behavior to changing traffic.

Performance Aware SDN describes the theory behind analytics driven orchestration. The talk describes how fast controller response, programmatic configuration interfaces and consistent instrumentation of all the elements being orchestrated are pre-requisites for feedback control.

This article uses an Alcatel-Lucent OmniSwitch 6900 as an example. The switch has hardware sFlow support for line rate visibility on all ports, and support for OpenFlow and a RESTful configuration API to deploy control actions. In this example a basic DDoS mitigation filtering function will be triggered when large flood attacks are detected. The script is based on the version described in the article Integrated hybrid OpenFlow, but modified to use the OmniSwitch RESTful API.

RESTful control of switches describes how RESTful configuration access to switches can be used to develop simple, controller-less SDN solutions. In this example the controller application is implemented using JavaScript that runs within the sFlow-RT analytics engine. The script has access to analytics data based on sFlow received from all the switches in the network and can directly access any switch using HTTP to make configuration changes. The script also provides a simple HTTP "Northbound API" that allows orchestration software to enable / disable the control function and manually add and remove controls.

include('extras/aluws.js');

// ---- flow definition -------------------------------------------------
// Passed to setFlow(): flows are keyed on ingress interface and source
// address, measured in frames, restricted by the filter expression.
var flowkeys = 'inputifindex,ipsource';
var value = 'frames';
var filter = 'direction=ingress&icmptype=8';

// Passed to setThreshold() as the trigger level for the flow metric.
var threshold = 1000;

// ---- controller state ------------------------------------------------
var metricName = 'ddos';   // name shared by the flow and its threshold
var controls = {};         // deployed controls, keyed by blocked address
var enabled = true;        // toggled by the HTTP enable / disable actions
var blockSeconds = 20;     // age after which a control is released
var ruleid = 0;            // counter used to build unique policy names

// ---- sFlow collector (where the switches send sFlow) -----------------
var collectorIP = '10.0.0.162';
var collectorPort = 6343;

// ---- switches under control, keyed by agent address ------------------
// user / password : credentials handed to ALUServer
// ports           : port range used in the sFlow sampler / poller commands
// sampling        : sampler rate
// polling         : poller interval
var agents = {
    '10.0.0.234': {
        user: 'admin',
        password: 'password',
        ports: '1/1-20',
        sampling: 128,
        polling: 20
    }
};

/**
 * Configure sFlow export on one switch and cache its ifIndex -> ifName map.
 *
 * Looks up the agent's record in `agents`, creates an ALUServer for it and
 * stores it as rec.server (block() and allow() reuse it), pushes the sFlow
 * agent / receiver / sampler / poller commands, then reads ifName from the
 * ifXTable MIB and stores the mapping as rec.ifIndexToName.
 *
 * The session is always logged out, even when a command or the MIB query
 * throws; the exception still propagates to the caller.
 *
 * @param agent  key into `agents`; also passed to ALUServer and used as the
 *               sFlow agent IP in the switch configuration
 */
function initializeAgent(agent) {
    var rec = agents[agent];
    var server = new ALUServer(agent,rec.user,rec.password);
    rec.server = server;

    var ifIndexToName = {};

    server.login();
    try {
        // configure sFlow
        server.runCmds([
          'sflow agent ip ' + agent,
          'sflow receiver 1 name InMon address '+collectorIP+' udp-port '+collectorPort,
          'sflow sampler 1 port '+rec.ports +' receiver 1 rate '+rec.sampling,
          'sflow poller 1 port '+rec.ports +' receiver 1 interval '+rec.polling
        ]);

        // get ifIndex to ifName mapping
        var res = server.rest('get','mib','ifXTable',{mibObject0:'ifName'});
        var rows = res.result.data.rows;
        for(var ifIndex in rows) ifIndexToName[ifIndex] = rows[ifIndex].ifName;
    } finally {
        // never leave a session open on the switch if something above failed
        server.logout();
    }

    rec.ifIndexToName = ifIndexToName;
}

/**
 * Install a drop policy for a source address on a switch and record it in
 * `controls` so that allow() can undo it later.
 *
 * Does nothing when the address already has a control or when `agent` is
 * not a key of `agents`. The switch session is always logged out, even if
 * the policy commands throw; in that case no control is recorded and the
 * exception propagates.
 *
 * @param agent  key into `agents` identifying the switch to configure
 * @param ip     IPv4 source address to drop (dotted quad)
 * @param port   interface name supplied by callers; not used by the
 *               policy commands issued here
 * @throws Error when ip is not a dotted-quad IPv4 address
 */
function block(agent,ip,port) {
    // ip is spliced into CLI commands and may originate from the HTTP API,
    // so refuse anything that is not a plain dotted-quad address
    var ipv4 = /^(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}$/;
    if(!ipv4.test(String(ip))) throw new Error('invalid IPv4 address: ' + ip);

    if(controls[ip]) return;

    var rec = agents[agent];
    if(!rec) return;

    // condition, action and rule all share one generated name
    var name = 'rt' + ruleid++;

    rec.server.login();
    try {
        rec.server.runCmds([
          'policy condition '+name+' source ip '+ip,
          'policy action '+name+' disposition drop',
          'policy rule '+name+' condition '+name+' action '+name,
          'qos apply'
        ]);
    } finally {
        // close the session whether or not the commands succeeded
        rec.server.logout();
    }

    controls[ip] = {
        name: name,
        agent: agent,
        action: 'block',
        time: (new Date()).getTime()
    };
}

/**
 * Remove a previously deployed control: delete the policy rule, action and
 * condition from the switch that holds it, then forget it in `controls`.
 *
 * Does nothing when `ip` has no recorded control. The switch session is
 * always logged out; if the removal commands throw, the control is kept in
 * `controls` (so it can be retried) and the exception propagates.
 *
 * @param ip  address whose control should be removed (key of `controls`)
 */
function allow(ip) {
    // own-property test so names inherited from Object.prototype
    // (e.g. "constructor") are never mistaken for a deployed control
    if(!Object.prototype.hasOwnProperty.call(controls,ip)) return;

    var ctl = controls[ip];
    var rec = agents[ctl.agent];

    rec.server.login();
    try {
        // remove in the reverse order of creation: rule, action, condition
        rec.server.runCmds([
          'no policy rule '+ctl.name,
          'no policy action '+ctl.name,
          'no policy condition '+ctl.name,
          'qos apply'
        ]);
    } finally {
        // close the session whether or not the commands succeeded
        rec.server.logout();
    }

    delete controls[ip];
}

// Event handler registered for [metricName]. When the controller is enabled
// and the reporting agent is one of the configured switches, block the
// source address carried in the event's flow key.
setEventHandler(function(evt) {
  if(!enabled) return;

  var switchAddr = evt.agent;

  // the key fields are read in the order given by flowkeys:
  // [0] = inputifindex, [1] = ipsource
  var fields = evt.flowKey.split(',');

  var rec = agents[switchAddr];
  if(rec) {
    // translate the ifIndex into the interface name before calling block()
    block(switchAddr, fields[1], rec.ifIndexToName[fields[0]]);
  }
}, [metricName]);


// Periodic cleanup, registered with an interval argument of 10: release
// every control that is older than blockSeconds.
setIntervalHandler(function() {
  var now = (new Date()).getTime();
  var threshMs = 1000 * blockSeconds;

  // collect first, release afterwards, so `controls` is not modified
  // while it is being enumerated
  var stale = [];
  for(var addr in controls) {
    if((now - controls[addr].time) > threshMs) stale.push(addr);
  }

  for(var i = 0; i < stale.length; i++) {
    try {
      allow(stale[i]);
    } catch(e) {
      // A failure releasing one control must not prevent the remaining
      // stale controls from being released. The control stays in
      // `controls`, so it is retried on the next run; the failure is
      // recorded on the control, where the HTTP handler's JSON exposes it.
      var failed = controls[stale[i]];
      if(failed) failed.error = String(e && e.message ? e.message : e);
    }
  }
},10);


// HTTP handler: dispatches on the "action" query parameter.
//   block   - needs agent, address and port; calls block()
//   allow   - needs address; calls allow()
//   enable  - turns automatic control on
//   disable - turns automatic control off
// Any other (or missing) action changes nothing. The response is always a
// JSON string holding the current controls and enabled flag, plus an
// "error" message if an exception was raised while handling the request.
setHttpHandler(function(request) {
  var result = {};
  try {
    var query = request.query;
    var action = '' + query.action;

    if(action === 'block') {
      var blockAgent = query.agent[0];
      var blockAddress = query.address[0];
      var blockPort = query.port[0];
      if(blockAgent && blockAddress && blockPort) {
        block(blockAgent, blockAddress, blockPort);
      }
    } else if(action === 'allow') {
      var allowAddress = query.address[0];
      if(allowAddress) {
        allow(allowAddress);
      }
    } else if(action === 'enable') {
      enabled = true;
    } else if(action === 'disable') {
      enabled = false;
    }
  } catch(e) {
    result.error = e.message;
  }

  result.controls = controls;
  result.enabled = enabled;
  return JSON.stringify(result);
});

// Define the flow metric, using the keys, value and filter configured at
// the top of the script.
setFlow(metricName, {
    keys: flowkeys,
    value: value,
    filter: filter
});

// Attach a per-flow threshold to that metric.
setThreshold(metricName, {
    metric: metricName,
    value: threshold,
    byFlow: true,
    timeout: 10
});

// Configure sFlow on, and read the interface names from, every switch
// listed in agents.
var agent;
for(agent in agents) initializeAgent(agent);

The following command line argument loads the script on startup:

-D script.file=omniddos.js

Some notes on the script:

The included extras/aluws.js script defines the ALUServer() function which provides access to the OmniSwitch Web Services API
The filter looks for flows of ingress ICMP echo request packets - this is useful for the demo, but in practice filters would be constructed to look for attacks from external sources, targeting internal servers - see Performance aware software defined networking
The controls structure is used to keep track of state associated with deployed configuration changes so that they can be undone
The interval handler function (registered with setIntervalHandler()) is used to automatically release controls after 20 seconds - the timeout is short for the purposes of demonstration; in practical deployments the timeout would be much longer, typically measured in hours
The ifIndexToName mapping allows the ifIndex numbers reported by sFlow to be mapped to interface names in CLI commands
Additional switches and settings can be added to the agents structure - hundreds of switches can be monitored and controlled by a single sFlow-RT instance.
The block() and allow() commands use filtering policy commands to implement controls that block traffic. The script can easily be modified to implement different policies (for example to rate limit or mark traffic), or in the case of large flood attacks, changing BGP settings to cause the upstream provider to drop traffic (e.g. Hurricane Electric Customer Blackhole Community)

To try out the script, use a web browser to view a trend of traffic and then perform the following steps:

disable the controller (http://10.0.0.162:8008/script/omniddos.js/json?action=disable)
perform a simulated DoS attack (using a flood ping)
enable the controller (http://10.0.0.162:8008/script/omniddos.js/json?action=enable)
simulate a second DoS attack

When the controller is disabled, the simulated attack traffic exceeds 3,000 packets per second and persists until the attacker stops sending. When the controller is enabled, traffic is blocked when it hits the 1,000 packet per second threshold in the application. The control is removed 20 seconds later and re-triggers if the attacker is still sending traffic.

DDoS mitigation is only one use case for large flow control, others described on this blog include: ECMP / LAG load balancing, traffic marking, blacklists, and packet capture. Scripts can be added to address these different use cases, as well as providing information on network health and server performance to operations teams (see Exporting events using syslog and Metric export to Graphite)

Thursday, January 9, 2014

Alcatel-Lucent OmniSwitch analytics driven control

No comments:

Post a Comment