Notebook on nbviewer (original) (raw)
Cleaning the data
import numpy as np import gzip
def row_generator():
    """Yield each edge of the graph file as a pair of ints.

    Opens the gzip-compressed text file named by ``LIVEJOURNAL_FILENAME``,
    ignores every line that begins with ``#``, and splits each remaining
    line on a tab into exactly two integer node ids.
    """
    with gzip.open(LIVEJOURNAL_FILENAME, 'rt') as edge_file:
        # Lazily filter out the '#'-prefixed lines before parsing.
        data_rows = (row for row in edge_file if not row.startswith('#'))
        for row in data_rows:
            first, second = row.split('\t')
            # int() tolerates the trailing newline left on the second field.
            yield int(first), int(second)
def to_undirected(edge_iterable, num_edges, num_nodes, shuffle=True):
    """Takes an iterable of edges and produces the list of edges for the undirected graph.

    Every input edge ``(a, b)`` contributes both ``(a, b)`` and ``(b, a)``;
    duplicates are removed and the rows come back sorted by
    ``(left, right)``.

    Args:
        edge_iterable: iterable of ``(node, node)`` pairs of non-negative ints.
        num_edges: number of rows to pre-allocate. If the iterable yields
            fewer edges, only the rows actually read are used.
        num_nodes: number of nodes; sizes the bit-packing and the shuffle
            permutation.
        shuffle: if true, relabel the nodes with a random permutation of
            ``range(num_nodes)`` (drawn from ``np.random``) before building
            the edge list, so ids are used as indices into that permutation.

    Returns:
        C-contiguous ``int32`` array of shape ``(n, 2)``.

    Raises:
        IndexError: if ``edge_iterable`` yields more than ``num_edges`` edges.

    >>> to_undirected([[0, 1], [1, 2], [2, 10]], 3, 11, shuffle=False).tolist()
    [[0, 1], [1, 0], [1, 2], [2, 1], [2, 10], [10, 2]]
    """
    # need int64 to do gross bithacks
    as_array = np.zeros((num_edges, 2), dtype=np.int64)
    num_read = 0
    for (i, (n_0, n_1)) in enumerate(edge_iterable):
        as_array[i, 0] = n_0
        as_array[i, 1] = n_1
        num_read = i + 1
    # Drop rows that were never filled; otherwise the leftover zeros would
    # show up in the result as a phantom (0, 0) edge.
    as_array = as_array[:num_read]

    # The graph is directed, but we want to make it undirected,
    # which means we will duplicate some rows.
    left_nodes = as_array[:, 0]
    right_nodes = as_array[:, 1]

    if shuffle:
        # Force int64: the default integer width is platform dependent, and
        # a 32-bit permutation would overflow in the left shift below.
        the_shuffle = np.arange(num_nodes, dtype=np.int64)
        np.random.shuffle(the_shuffle)
        left_nodes = the_shuffle.take(left_nodes)
        right_nodes = the_shuffle.take(right_nodes)

    # numpy.unique will not unique whole rows, so this little bit-hacking
    # is a quick way to get unique rows after making a flipped copy of
    # each edge.
    max_bits = int(np.ceil(np.log2(num_nodes + 1)))

    encoded_edges_forward = np.left_shift(left_nodes, max_bits) | right_nodes

    # Flip the columns and do it again:
    encoded_edges_reverse = np.left_shift(right_nodes, max_bits) | left_nodes

    unique_encoded_edges = np.unique(
        np.hstack((encoded_edges_forward, encoded_edges_reverse)))

    left_node_decoded = np.right_shift(unique_encoded_edges, max_bits)

    # Mask out the high order bits
    right_node_decoded = (2 ** max_bits - 1) & unique_encoded_edges

    undirected_edges = np.vstack(
        (left_node_decoded, right_node_decoded)).T.astype(np.int32)

    # ascontiguousarray so that it's c-contiguous for cython code below
    return np.ascontiguousarray(undirected_edges)
def get_clean_data(shuffle=True):
    """Return the cleaned undirected edge array, using an on-disk cache.

    The cache lives in ``DATA_DIRECTORY`` under a filename that depends on
    ``shuffle``. When that file exists it is loaded with ``np.load``;
    otherwise the edges are built with ``to_undirected`` from
    ``row_generator()``, a summary line is printed, and the array is saved
    to the cache file before being returned.
    """
    filename = 'LJ-cleaned-shuffled.npy' if shuffle else 'LJ-cleaned.npy'
    name = os.path.join(DATA_DIRECTORY, filename)

    # Fast path: a previous run already wrote the cleaned array.
    if os.path.exists(name):
        print('Loading from file {}'.format(name))
        return np.load(name)

    print('Parsing from zip. Will write to file {}'.format(name), flush=True)
    # Lets get the edges into one big array
    edges = to_undirected(row_generator(), NUM_EDGES, NUM_NODES, shuffle=shuffle)

    # Mean absolute difference between the two node ids of each edge,
    # plus the smallest and largest id present.
    mean_dist = np.abs(edges[:, 0] - edges[:, 1]).mean()
    print('ORIGINAL DIST: {} MIN: {} MAX: {}'.format(mean_dist, edges.min(), edges.max()))

    np.save(name, edges)
    return edges