attachment:DataFrame.py of Cookbook/DataFrame - SciPy wiki dump (original) (raw)

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 import math 32 import sets 33 from scipy import * 34 import Numeric 35 import cStringIO as StringIO 36 import csv 37 class access2000(csv.Dialect): 38 """A dialect to properly interpret Microsoft Access2000 CSV exports for international languages. 39 """ 40 delimiter = ';' 41 quotechar = '"' 42 doublequote = True 43 quoting = csv.QUOTE_NONNUMERIC 44 lineterminator = '\n' 45 skipinitialspace = True 46 47 class DataFrame: 48 def init(self, value_dict=None, fields_order=None ): 49 if value_dict is None: 50 value_dict = {} 51 self.value_dict = value_dict 52 num_rows = 0 53 for column in self.value_dict.values(): 54 try: 55 num_rows = max(num_rows,len(column)) 56 except: 57 pass 58 for key in self.value_dict.keys(): 59 if self.value_dict[key] is None: 60 self.value_dict[key] = (None,)*num_rows 61 for field in self.value_dict: 62 if len(self.value_dict[field]) != num_rows: 63 raise ValueError("field %s has wrong number of rows"%str(field)) 64 self.num_rows = num_rows 65 if fields_order is None: 66 self.fields_order = self.value_dict.keys() 67 else: 68 for key in fields_order: 69 assert self.value_dict.has_key(key) 70 self.fields_order = fields_order 71 72 def copy_empty(self): 73 vd = {} 74 for k in self.value_dict.keys(): 75 vd[k] = None 76 return DataFrame(vd,self.fields_order) 77 78 def add(self,other): 79 nd = {} 80 for k in other.fields_order: 81 if k not in self.fields_order: 82 raise NotImplementedError("no fix yet for when not all fields are in both frames") 83 for k in self.fields_order: 84 nd[k] = list(self.value_dict[k]) + list(other.value_dict[k]) 85 res = DataFrame(nd,self.fields_order) 86 return res 87 88 def insert_row(self, value_dict, new_fields_ok=False): 89 if not new_fields_ok: 90 for v in value_dict: 91 assert(v in self.fields_order) 92 for v in self.fields_order: 93 assert(v in value_dict) 94 try: 95 self.value_dict[v].append( value_dict[v] ) 96 except AttributeError: 97 tmp = list(self.value_dict[v]) 98 tmp.append( value_dict[v] ) 99 self.value_dict[v] = tmp 100 self.num_rows += 1 101 else: 102 all_fields = list(sets.Set(self.fields_order).union(value_dict.keys())) 103 all_fields.sort() 104 all_fields.reverse() 105 for v in all_fields: 106 if v in value_dict: 107 if v not in self.value_dict: 108 self.value_dict[v] = [None]*self.num_rows 109 try: 110 self.value_dict[v].append(value_dict[v]) 111 except AttributeError: 112 tmp = list(self.value_dict[v]) 113 tmp.append( value_dict[v] ) 114 self.value_dict[v] = tmp 115 else: 116 try: 117 self.value_dict[v].append(None) 118 except AttributeError: 119 tmp = list(self.value_dict[v]) 120 tmp.append( None ) 121 self.value_dict[v] = tmp 122 self.fields_order = all_fields 123 self.num_rows += 1 124 125 def insert_column(self, field_name, values, position='last'): 126 assert len(values) == self.num_rows 127 if position == 'last': 128 self.fields_order.append(field_name) 129 else: 130 self.fields_order.insert(position,field_name) 131 self.value_dict[field_name] = values 132 133 def drop_column(self, field_name): 134 self.fields_order.remove(field_name) 135 del self.value_dict[field_name] 136 137 def drop_all_columns_except(self, *field_names): 138 save_names = list(field_names) 139 for field_name in self.fields_order[:]: 140 if field_name not in save_names: 141 self.drop_column( field_name ) 142 143 def str(self): 144 def cc(s,width=10,just='center'): 145 if len(s) > width: 146 s = s[:width] 147 if just=='center': 148 return s.center(width) 149 elif just=='left': 150 return s.ljust(width) 151 elif just=='right': 152 return s.rjust(width) 153 buf = StringIO.StringIO() 154 print >> buf, cc('',width=5,just='right'), 155 for field in self.fields_order: 156 print >> buf, cc( field), 157 print >> buf 158 for row in range(self.num_rows): 159 print >> buf, cc(str(row),width=5,just='right'), 160 for field in self.fields_order: 161 v = self.value_dict[field][row] 162 if v is not None: 163 v_str = str(v) 164 else: 165 v_str = '' 166 print >> buf, cc( v_str ), 167 print >> buf 168 buf.seek(0) 169 return buf.read() 170 171 def get_row_dict(self,row_idx): 172 return self[row_idx] 173 174 def getitem(self, i): 175 result = {} 176 for field in self.fields_order: 177 result[field] = self.value_dict[field][i] 178 return result 179 180 def getitems(self, idxs): 181 result = [] 182 for i in idxs: 183 result.append( self[i] ) 184 return result 185 186 def len(self): 187 return self.num_rows 188 189 def __get_row(self,row_idx): 190 result = [] 191 for field in self.fields_order: 192 result.append( self.value_dict[field][row_idx] ) 193 return result 194 195 def __get_rows(self,row_idxs): 196 if len(row_idxs) == 0: 197 return 198 rows = [self.__get_row(row_idx) for row_idx in row_idxs] 199 by_col = zip(*rows) 200 result = {} 201 for i,field in enumerate(self.fields_order): 202 result[field] = by_col[i] 203 return result 204 205 def where_field_cmp(self, field, bool_func_of_value): 206 col = self.value_dict[field] 207 indices = [] 208 for i in range(len(col)): 209 if bool_func_of_value(col[i]): 210 indices.append(i) 211 results = self.__get_rows(indices) 212 if results is not None: 213 return DataFrame(results,fields_order=self.fields_order) 214 else: 215 return None 216 217 def where_field_equal(self, field, value, eps=None): 218 col = self.value_dict[field] 219 indices = [] 220 if eps is None: 221 for i in range(len(col)): 222 if col[i] == value: 223 indices.append(i) 224 else: 225
226 a = asarray(col) 227 abs_diff = abs(a-value) 228 indices = nonzero( less( abs_diff, eps ) ) 229 results = self.__get_rows(indices) 230 if results is not None: 231 return DataFrame(results,fields_order=self.fields_order) 232 else: 233 return None 234 235 def where_field_not_equal(self, field, value, eps=None): 236 col = self.value_dict[field] 237 indices = [] 238 if eps is None: 239 for i in range(len(col)): 240 if col[i] != value: 241 indices.append(i) 242 else: 243
244 a = numpy.asarray(col) 245 abs_diff = abs(a-value) 246 indices = numpy.nonzero( numpy.greater_equal( abs_diff, eps ) ) 247 results = self.__get_rows(indices) 248 if results is not None: 249 return DataFrame(results,fields_order=self.fields_order) 250 else: 251 return None 252 253 def where_field_less(self, field, value): 254 col = self.value_dict[field] 255 indices = [] 256 for i in range(len(col)): 257 if col[i] < value: 258 indices.append(i) 259 results = self.__get_rows(indices) 260 if results is not None: 261 return DataFrame(results,fields_order=self.fields_order) 262 else: 263 return None 264 265 def where_field_lessequal(self, field, value): 266 col = self.value_dict[field] 267 indices = [] 268 for i in range(len(col)): 269 if col[i] <= value: 270 indices.append(i) 271 results = self.__get_rows(indices) 272 if results is not None: 273 return DataFrame(results,fields_order=self.fields_order) 274 else: 275 return None 276 277 def where_field_greater(self, field, value): 278 col = self.value_dict[field] 279 indices = [] 280 for i in range(len(col)): 281 if col[i] > value: 282 indices.append(i) 283 results = self.__get_rows(indices) 284 if results is not None: 285 return DataFrame(results,fields_order=self.fields_order) 286 else: 287 return None 288 289 def where_field_greaterequal(self, field, value): 290 col = self.value_dict[field] 291 indices = [] 292 for i in range(len(col)): 293 if col[i] >= value: 294 indices.append(i) 295 results = self.__get_rows(indices) 296 if results is not None: 297 return DataFrame(results,fields_order=self.fields_order) 298 else: 299 return None 300 301 def where_field_in(self, field, values): 302 col = self.value_dict[field] 303 indices = [] 304 for i in range(len(col)): 305 if col[i] in values: 306 indices.append(i) 307 results = self.__get_rows(indices) 308 if results is not None: 309 return DataFrame(results,fields_order=self.fields_order) 310 else: 311 return None 312 313 def enumerate_on(self, field, cmp_func=None): 314 values = self.get_unique_values(field) 315 result_frames = [] 316 values.sort(cmp_func) 317 for value in values: 318 result_frames.append( (value, self.where_field_equal(field,value))) 319 return iter(result_frames) 320 321 def enumerate_crude_bins(self, field, eps=None, eps_domain='linear'): 322 if eps is None: 323 return self.enumerate_on(field) 324 if eps_domain == 'linear': 325 def filt(x): 326 return x 327 elif eps_domain == 'log10': 328 def filt(x): 329 return numpy.log10(x) 330 else: 331 raise NotImplementedError 332 vs = self.get_unique_values(field) 333 vs.sort() 334 bins = {} 335 current_starter_v = None 336 for cv in vs: 337 if current_starter_v is not None and abs(filt(cv)-filt(current_starter_v))<eps: 338 bins[current_starter_v].append(cv) 339 else: 340 bins.setdefault(cv,[]).append(cv) 341 current_starter_v = cv 342 results = [] 343 keys = bins.keys() 344 keys.sort() 345 for close_v in keys: 346 running_sum = 0 347 running_n = 0 348 accum = self.copy_empty() 349 for v in bins[close_v]: 350 add_frame = self.where_field_equal(field,v) 351 n = add_frame.num_rows 352 running_sum += (v*n) 353 running_n += n 354 accum = accum + add_frame 355 avg_value = running_sum/float(running_n) 356 results.append(( avg_value, accum )) 357 return iter(results) 358 359 def enumerate_crude_2_dims(self, field1, field2, 360 eps1=None, eps2=None, 361 eps1_domain='linear', eps2_domain='linear'): 362 axis1_vs = {} 363 for v,vf in self.enumerate_crude_bins(field1,eps=eps1,eps_domain=eps1_domain): 364 axis1_vs[v] = vf.get_unique_values(field1) 365 366 axis2_vs = {} 367 for v,vf in self.enumerate_crude_bins(field2,eps=eps2,eps_domain=eps2_domain): 368 axis2_vs[v] = vf.get_unique_values(field2) 369 370 results = [] 371 372 v1s = axis1_vs.keys() 373 v1s.sort() 374 375 v2s = axis2_vs.keys() 376 v2s.sort() 377 for v1 in v1s: 378 for v2 in v2s: 379 this_result = self.copy_empty() 380 for v1r in axis1_vs[v1]: 381 for v2r in axis2_vs[v2]: 382 tmp1 = self.where_field_equal(field1,v1r) 383 if tmp1 is not None: 384 tmp2 = tmp1.where_field_equal(field2,v2r) 385 if tmp2 is not None: 386 this_result = this_result + tmp2 387 if this_result.num_rows > 0: 388 results.append((v1,v2,this_result)) 389 return iter(results) 390 391 def mean(self, field): 392 return mean(self.value_dict[field]) 393 394 def mean_and_std(self, field): 395 values = self.value_dict[field] 396 return mean(values), std(values) 397 398 def mean_and_sem(self, field): 399 values = self.value_dict[field] 400 n = len(values) 401 return mean(values), std(values)/float(numpy.sqrt(n)) 402 403 def get_fields(self): 404 return self.fields_order[:] 405 406 def sort_by(self,field,ascending=True): 407 orig = self.value_dict[field] 408 sorted = list(orig[:]) 409 sorted.sort() 410 if not ascending: 411 sorted.reverse() 412 my_copy = list(orig[:]) 413 new_order = [] 414 for i in range(len(sorted)): 415 sorted_value = sorted[i] 416 index = my_copy.index( sorted_value ) 417 new_order.append( index ) 418 my_copy[index] = int 419 for field in self.fields_order: 420 orig_list = self.value_dict[field] 421 new_list = [] 422 for i in new_order: 423 new_list.append( orig_list[i] ) 424 self.value_dict[field] = new_list 425 426 def sorted(self,field,ascending=True): 427 result = DataFrame(self.value_dict, self.fields_order) 428 result.sort_by(field,ascending=ascending) 429 return result 430 431 def get_unique_values(self, field): 432 unique_values = list(sets.Set(self.get_all_values(field))) 433 try: 434 unique_values.sort() 435 except: 436 pass 437 return unique_values 438 439 def get_all_values(self, field): 440 values = [ v for v in self.value_dict[field] ] 441 return values 442 443 def write_csv(self,filename,dialect='excel'): 444 writer = csv.writer(open(filename,'w'),dialect=dialect) 445 writer.writerow( self.fields_order ) 446 for i in range(self.num_rows): 447 writer.writerow( self.__get_row(i) ) 448 449 def read_csv(file,header=True,dialect='excel'): 450 if not hasattr(file,'readlines'): 451
452 reader = csv.reader(open(file,"r"),dialect=dialect) 453 else: 454 reader = csv.reader(file,dialect=dialect) 455 split_lines = [ row for row in reader ] 456 if header: 457 fields = split_lines.pop(0) 458 else: 459 num_fields = len(split_lines[0]) 460 num_chars = int(math.ceil(math.log10(num_fields))) 461 name_string = "column%0"+str(num_chars)+"d" 462 fields = [ name_string%(i+1) for i in range(len(split_lines[0])) ] 463
464 for i in range(len(split_lines)): 465 split_line = split_lines[i] 466 for j in range(len(split_line)): 467 value = split_line[j] 468 if value == '': 469 converted_value = None 470 else: 471 try: 472 converted_value = int(value) 473 except ValueError: 474 try: 475 converted_value = float(value) 476 except: 477 converted_value = value 478 split_line[j] = converted_value 479 split_lines[i] = split_line 480 columns = zip(*split_lines) 481 results = {} 482 for i in range(len(fields)): 483 field = fields[i] 484 try: 485 results[field] = list(columns[i]) 486 except IndexError: 487 results[field] = None 488 489 return DataFrame( results, fields_order=fields )

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the

link, since this is subject to change and can break easily.