I'm currently working on sorting netflow data in a JSON file based on end time. I'm placing all of this data into dictionaries in which the keys are the end time (but only the hour and minute, so that multiple data values fall under one time). However, this is taking a bit long — no longer than a few seconds, but that's still too long. What's a good way to improve the big O of this? What I'm doing right now is going through the file line by line, extracting the end times, and creating an empty dictionary where the keys are the hour/minute of each end time and the values are empty sets. Then I go through the data a second time and, for each line, add it to the set stored under the key that matches its end time.
Edit: Here is a sample of the kind of JSON data; the following is one line of it. The files I'm working with are close to 300,000 lines.
{
"@timestamp": "2015-05-18T19:26:08.000Z",
"netflow": {
"version": "9",
"flow_seq_num": "188185",
"flowset_id": "257",
"last_switched": "2015-05-15T14:28:02.999Z",
"first_switched": "2015-05-15T14:27:38.999Z",
"in_bytes": "71",
"in_pkts": "1",
"input_snmp": "5",
"output_snmp": "4",
"ipv4_src_addr": "192.1.44.133",
"ipv4_dst_addr": "10.10.1.4",
"protocol": "6",
"src_tos": "0",
"dst_tos": "2",
"l4_src_port": "12373",
"l4_dst_port": "80",
"flow_sampler_id": "0",
"ipv4_next_hop": "10.10.1.5",
"dst_mask": "2",
"src_mask": "31",
"tcp_flags": "6",
"direction": "0"
},
"@version": "1",
"host": "192.168.19.202",
"src_host_name": "",
"dst_host_name": "",
"app_name": "",
"tcp_flags_str": "",
"dscp": "",
"highval": "",
"src_blacklisted": "0",
"dst_blacklisted": "0",
"invalid_ToS": "0",
"bytes_per_packet": 71,
"tcp_nominal_payload": "0",
"malformed_ip": "0",
"empty_tcp": "0",
"short_tcp_handshake": "0",
"icmp_malformed_packets": "0",
"snort_attack_flow": "0",
"empty_udp": "0",
"short_udp": "0",
"short_tcp_rstack": "0",
"short_tcp_pansf": "0",
"short_tcp_synack": "0",
"short_tcp_synrst": "0",
"short_tcp_finack": "0",
"short_tcp_pna": "0",
"non_unicast_src": "0",
"multicast": "0",
"broadcast": "0",
"network": "0",
"tcp_urg": "0",
"land_attack": "0",
"short_tcp_ack": "0",
"tcp_synfin": "0",
"tcp_fin": "0",
"malformed_tcp": "1",
"tcp_xmas": "0",
"udp_echo_req": "0",
"tcp_null": "0",
"tcp_syn": "0",
"malformed_udp": "0",
"tcp_rst": "0",
"icmp_request": "0",
"icmp_response": "0",
"icmp_port_unreachable": "0",
"icmp_host_unreachable": "0",
"icmp_unreachable_for_Tos": "0",
"icmp_network_unreachable": "0",
"icmp_redirects": "0",
"icmp_time_exceeded_flows": "0",
"icmp_parameter_problem_flows": "0",
"icmp_trace_route": "0",
"icmp_datagram": "0",
"udp_echo_chargen_broadcast": "0",
"udp_chargen_echo_broadcast": "0",
"icmp_src_quench": "0",
"icmp_proto_unreachable": "0",
"udp_echo_broadcast": "0",
"udp_echo_rsp": "0"
}
As for code I have tried: currently I'm just converting these lines into dictionaries to access the different values I'm looking to sort by. It's really simple — I'm just calling json.loads on each line to create a dictionary. What kind of data structure is best for organizing this kind of thing? I'm using a dictionary for now, but is there a better one?