Being curious as to how well LumberMill scales with more CPUs, I did a quick and simple benchmark. And while I was at it, I also included performance data for running LumberMill with Python-2.6, pypy-2.5.1 and the new pypy-4.0. As with all benchmarks, results need to be taken with a pillar of salt ;)
The machine used was an x86_64 CentOS-6.5 vm image with 1GB RAM and 3 cores. The host was a Core i3 iMac/3.06GHz.
The logstash data is only included as a reference for me. The generator.threads setting was set to the number of filter workers started for each run.
Here are the results:
The LumberMill configuration used:
# LumberMill pipeline configuration used for the benchmark.
# NOTE(review): the YAML indentation appears to have been flattened by the blog
# formatting — module options would normally be indented under their module
# entry. Verify against the original config before reusing.
# Sets number of parallel LumberMill processes.
- Global:
workers: 3
# Input module: repeatedly emits the same sample nginx access-log line to generate load.
- Spam:
event: '<13>Feb 11 13:01:36 john nginx: test 1392123696.090 localhost GET "/item/get?Template=Mobile_FacetedBrowsing_Bereich_DE&RowCount=24&Query=Binaries_vorderansicht:%3A%5B0+TO+A%5D&Sort=Preis_ASC&StartAtRowNo=265&Bereich=%22Skate%22&Oberkategorie=%22Longboards%22&Kundeninfo=%22Lieferbar%22coming+soon%22" "" - 200 98198 403 0.002 127.0.0.1 57679 "-" "/index.php" "curl/7.15.5 libcurl/7.15.5 OpenSSL/0.9.8b zlib/1.2.3 server/localhost" HIT "-" "-" "application/xml" "deflate, gzip" "gzip" "client-user-agent\x02Mozilla/5.0 (Linux; U; Android 4.1.2; de-de; GT-I8190 Build/JZO54K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30\x01client-addr\x02127.0.0.1,13.13.13.25\x01server-name\x02localhost"']
#events_count: 105
# Extract named fields from the raw line via the named capture groups below.
- RegexParser:
source_field: data
hot_rules_first: True
field_extraction_patterns:
- test_access_log: '(?P<syslog_prival>\<\d+\>)(?P<log_timestamp>\w+\s+\d+\s+\d+:\d+:\d+)\s+(?P<host>[\w\._\-]+)\s+nginx: test (?P<request_ts>\d+\.\d+) (?P<server_name>[\w\._\-]+) (?P<http_method>[\w\-]+) \"(?P<uri>[^\"]*)\" \"(?P<request_body>[^\"]*)\" (?P<username>[\w\-]+) (?P<http_status>\d+) (?P<bytes_sent>\d+) (?P<request_size>\d+) (?P<request_time>[\d\.]+) (?P<remote_ip>[\d\.]+) (?P<remote_port>\d+) \"(?P<x_forwarded_for>[^\"]*)\" \"(?P<referer>[^\"]*)\" \"(?P<user_agent>[^\"]*)\" (?P<cache_status>[\w\-]+) \"(?P<upstream_response_time>[^\"]*)\" \"(?P<upstream_addr>[^\"]*)\" \"(?P<content_type>[^\"]*)\" \"(?P<accept_encoding>[^\"]*)\" \"(?P<content_encoding>[^\"]*)\" \"(?P<custom_vars>[^\"]*)\"'
# Print out some stats every 10 seconds.
- SimpleStats:
interval: 10
# Stamp each event with a formatted timestamp in the @timestamp field.
- AddDateTime:
format: '%Y-%m-%dT%H:%M:%S.%f'
target_field: "@timestamp"
# Add geo info based on the lookup_fields. The first field in <source_fields> that yields a result from geoip will be used.
- AddGeoInfo:
geoip_dat_path: /usr/share/GeoIP/GeoLiteCity.dat
source_fields: [x_forwarded_for, remote_ip]
geo_info_fields: ['latitude', 'longitude', 'country_code']
# Nginx logs request time in seconds with milliseconds as float. Apache logs microseconds as int.
# At least cast nginx to integer.
- Math:
filter: if $(server_type) == "nginx"
target_field: request_time
function: float($(request_time)) * 1000
# Cast numeric string fields to their proper types.
- ModifyFields:
action: cast_to_int
source_fields: [http_status, bytes_sent, request_size, request_time, remote_port, request_ts,]
- ModifyFields:
action: cast_to_float
source_fields: [upstream_response_time, latitude, longitude]
# Replace custom_vars Separators.
- ModifyFields:
action: string_replace
source_field: custom_vars
old: '\x01'
new: '¶'
# Replace custom_vars Separators.
- ModifyFields:
action: string_replace
source_field: custom_vars
old: '\x02'
new: '='
# Split custom vars to map.
- ModifyFields:
action: key_value
line_separator: '¶'
kv_separator: '='
source_field: custom_vars
# Map numeric HTTP status codes to their reason phrases.
- ModifyFields:
filter: if $(http_status)
action: map
source_field: http_status
map: {100: 'Continue', 200: 'OK', 301: 'Moved Permanently', 302: 'Found', 304: 'Not Modified', 400: 'Bad Request', 401: 'Unauthorized', 403: 'Forbidden', 404: 'Not Found', 500: 'Internal Server Error', 502: 'Bad Gateway'}
# Kibana’s ‘bettermap’ panel needs an array of floats in order to plot events on map.
- ModifyFields:
filter: if $(latitude)
action: merge
source_fields: [longitude, latitude]
target_field: geoip
# Parse the user_agent string into structured fields.
- UserAgentParser:
source_fields: user_agent
# Benchmark sink: discard all events instead of shipping them anywhere.
- DevNullSink
The logstash configuration:
# Logstash configuration used as the benchmark reference run.
input {
# Generate load: each generator thread repeatedly emits the same sample nginx line.
generator {
threads => 3
lines => ['<13>Feb 11 13:01:36 john nginx: test 1392123696.090 localhost GET "/item/get?Template=Mobile_FacetedBrowsing_Bereich_DE&RowCount=24&Query=Binaries_vorderansicht:%3A%5B0+TO+A%5D&Sort=Preis_ASC&StartAtRowNo=265&Bereich=%22Skate%22&Oberkategorie=%22Longboards%22&Kundeninfo=%22Lieferbar%22coming+soon%22" "" - 200 98198 403 0.002 127.0.0.1 57679 "-" "/index.php" "curl/7.15.5 libcurl/7.15.5 OpenSSL/0.9.8b zlib/1.2.3 server/localhost" HIT "-" "-" "application/xml" "deflate, gzip" "gzip" "client-user-agent\x02Mozilla/5.0 (Linux; U; Android 4.1.2; de-de; GT-I8190 Build/JZO54K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30\x01client-addr\x02127.0.0.1,13.13.13.25\x01server-name\x02localhost"']
}
}
filter {
# Extract named fields from the raw message with a single grok pattern.
grok {
match => [ "message", '(?<syslog_prival>\<\d+\>)(?<log_timestamp>\w+\s+\d+\s+\d+:\d+:\d+)\s+(?<host>[\w\._\-]+)\s+nginx: test (?<request_ts>\d+\.\d+) (?<server_name>[\w\._\-]+) (?<http_method>[\w\-]+) \"(?<uri>[^\"]*)\" \"(?<request_body>[^\"]*)\" (?<username>[\w\-]+) (?<http_status>\d+) (?<bytes_sent>\d+) (?<request_size>\d+) (?<request_time>[\d\.]+) (?<remote_ip>[\d\.]+) (?<remote_port>\d+) \"(?<x_forwarded_for>[^\"]*)\" \"(?<referer>[^\"]*)\" \"(?<user_agent>[^\"]*)\" (?<cache_status>[\w\-]+) \"(?<upstream_response_time>[^\"]*)\" \"(?<upstream_addr>[^\"]*)\" \"(?<content_type>[^\"]*)\" \"(?<accept_encoding>[^\"]*)\" \"(?<content_encoding>[^\"]*)\" \"(?<custom_vars>[^\"]*)\"']
}
# Add geoinfo to custom vars.
geoip {
source => "[remote_ip]"
target => "geoinfo"
database => "/opt/logstash-1.4.2/vendor/geoip/GeoLiteCity.dat"
}
# Nginx logs request time in seconds with milliseconds as float. Apache logs microseconds as int.
ruby { code => "begin event['request_time'] = (event['request_time'] * 1000).ceil; rescue; end" }
mutate {
# Cast fields.
# Int: [http_status, bytes_sent, remote_port, request_time, request_size, TreeNodeID]
convert => [ "http_status", "integer" ]
convert => [ "bytes_sent", "integer" ]
convert => [ "request_size", "integer" ]
convert => [ "request_time", "float" ]
convert => [ "remote_port", "integer" ]
convert => [ "request_ts", "integer" ]
# Float: [upstream_response_time, latitude, longitude]
convert => [ "upstream_response_time", "float" ]
convert => [ "latitude", "float" ]
convert => [ "longitude", "float" ]
# CustomVars Separators
gsub => ["custom_vars", "\\x01", '¶']
gsub => ["custom_vars", "\\x02", '=']
}
# Split custom_vars to map.
kv {
source => "custom_vars"
field_split => "¶"
# Fix: attribute values must be quoted strings — the original unquoted
# bareword (target => custom_vars) is rejected by the config parser.
target => "custom_vars"
}
# Translate numeric HTTP status codes to their reason phrases.
translate {
field => "http_status"
destination => "http_status_mapped"
override => true
dictionary => [ '100', 'Continue',
'200', 'OK',
'301', 'Moved Permanently',
'302', 'Found',
'304', 'Not Modified',
'400', 'Bad Request',
'401', 'Unauthorized',
'403', 'Forbidden',
'404', 'Not Found',
'500', 'Internal Server Error',
'502', 'Bad Gateway']
}
# Parse the client user agent (split out of custom_vars above) into structured fields.
if [custom_vars][client-user-agent] {
useragent {
source => "[custom_vars][client-user-agent]"
target => "custom_vars"
prefix => "useragent_"
}
}
# Meter event throughput; metric events are tagged so the output can select them.
metrics {
meter => "events"
add_tag => "metric"
}
}
output {
# Only the periodic metric events are printed; all other events are dropped
# (benchmark has no data output).
if 'metric' in [tags] {
stdout {
codec => rubydebug
}
}
}