', ''],

    [' ([original](https://bugs.python.org/file48019/SegfaultMinBugReplication.py)) ([raw](?raw))

# -*- coding: utf-8 -*- # BASEBOOK CONTENT COLLECTOR Copyright © Emmanuel Daugeras, Amdamax import glob, copy, ast, csv, io, random import requests from pandasdmx import Request from lxml import html from bs4 import BeautifulSoup import mechanicalsoup, urllib from slugify import slugify from math import floor from urllib.request import urlopen from urllib.error import URLError import sys, ast import os, json, datetime import xml.etree.ElementTree as ET import numpy as np import pandas as pd from IPython.display import display from pandas.io.json import json_normalize from os.path import isfile, join from dateutil.parser import parse from datetime import datetime import shutil from twisted.internet import reactor import scrapy from scrapy.crawler import CrawlerRunner from scrapy.utils.log import configure_logging #********************************************************************************************* # CONTENT FOR LOCAL SETTINGS FILE #********************************************************************************************* #contains the options of execution EXECUTION_PATH = { 'task': 'Test' #'task': 'Import Data.Gouv.Fr' } PAGES_TO_PARSE ={ 'start_page': 244, 'number_of_pages': 10, 'start_dataset': 0 } #Home Folder in which the content collector is going to be executed LOCAL_FOLDER = { #******************** SETTINGS FOR MAC PRO ********************* # 'project_home_path': '/Users/amdamax/CloudStation/Basedig/Basedig Content Collector/', #Absolue path for Mac Pro # 'parsing_log_folder': '/Users/amdamax/CloudStation/Basedig/Basedig Content Collector/Log Files/', #Absolue path for Mac Pro # 'Basebook_api_log_folder': '/Users/amdamax/CloudStation/Basedig/Basedig Content Collector/Log Files/', #Absolute path Mac Pro # 'BaseBookDataContainerQueue Folder': '/Users/amdamax/CloudStation/Basedig/Basedig Content Collector/BBDC Queue/', #Absolue path Mac Pro # 'Archive Folder': '/Users/amdamax/CloudStation/Basedig/Basedig Content Collector/Archive/', #Absolue path # 'Data Sources Folder': '/Users/amdamax/CloudStation/Basedig/Basedig Content Collector/Data Sources/', #Absolue path # 'json_parsed_files_folder': '/Users/amdamax/CloudStation/Basedig/Basedig Content Collector/json_parsed_files/', #Absolue path # 'temporary_files': '/Users/amdamax/CloudStation/Basedig/Basedig Content Collector/temporary_files/', #Absolue path #******************** SETTINGS FOR MAC BOOK PRO ********************* 'project_home_path': '/Users/emmanueldaugeras/Sync Synology DS415/CloudStation/Basedig/Basedig Content Collector/', #Absolue path for Mac book Pro 'json_parsed_files_folder': '/Users/emmanueldaugeras/Sync Synology DS415/CloudStation/Basedig/Basedig Content Collector/json_parsed_files/', #Absolue path 'parsing_log_folder': '/Users/emmanueldaugeras/Sync Synology DS415/CloudStation/Basedig/Basedig Content Collector/Log Files/', #Absolue path for Mac book Pro 'Basebook_api_log_folder': '/Users/emmanueldaugeras/Sync Synology DS415/CloudStation/Basedig/Basedig Content Collector/Log Files/', #Absolute path Mac Book Pro 'BaseBookDataContainerQueue Folder': '/Users/emmanueldaugeras/Sync Synology DS415/CloudStation/Basedig/Basedig Content Collector/BBDC Queue/', #Absolute path Mac Book Pro 'Archive Folder': '/Users/emmanueldaugeras/Sync Synology DS415/CloudStation/Basedig/Basedig Content Collector/Archive/', #Absolue path 'Data Sources Folder': '/Users/emmanueldaugeras/Sync Synology DS415/CloudStation/Basedig/Basedig Content Collector/Data Sources/', #Absolue path 'temporary_files': '/Users/emmanueldaugeras/Sync Synology DS415/CloudStation/Basedig/Basedig Content Collector/temporary_files/', #Absolue path #******************** SETTINGS FOR Linux HP ********************* # 'project_home_path': '/home/emmanuel/SynologyDrive/Basedig/Basedig Content Collector/', #Absolue path for Mac Pro # 'parsing_log_folder': '/home/emmanuel/SynologyDrive/Basedig/Basedig Content Collector/Log Files/', #Absolue path for Mac Pro # 'Basebook_api_log_folder': '/home/emmanuel/SynologyDrive/Basedig/Basedig Content Collector/Log Files/', #Absolute path Mac Pro # 'BaseBookDataContainerQueue Folder': '/home/emmanuel/SynologyDrive/CloudStation/Basedig/Basedig Content Collector/BBDC Queue/', #Absolue path Mac Pro # 'Archive Folder': '/home/emmanuel/SynologyDrive/Basedig/Basedig Content Collector/Archive/', #Absolue path # 'Data Sources Folder': '/home/emmanuel/SynologyDrive/Basedig/Basedig Content Collector/Data Sources/', #Absolue path # 'json_parsed_files_folder': '/home/emmanuel/SynologyDrive/CloudStation/Basedig/Basedig Content Collector/json_parsed_files/', #Absolue path # 'temporary_files': '/home/emmanuel/SynologyDrive/Basedig/Basedig Content Collector/temporary_files/', #Absolue path #******************** SETTINGS FOR BigLinux ********************* # 'project_home_path': '/home/emmanuel/SynologyDrive/Basedig/Basedig Content Collector/', #Absolue path for Mac Pro # 'parsing_log_folder': '/home/emmanuel/SynologyDrive/Basedig/Basedig Content Collector/Log Files/', #Absolue path for Mac Pro # 'Basebook_api_log_folder': '/home/emmanuel/SynologyDrive/Basedig/Basedig Content Collector/Log Files/', #Absolute path Mac Pro # 'BaseBookDataContainerQueue Folder': '/home/emmanuel/SynologyDrive/CloudStation/Basedig/Basedig Content Collector/BBDC Queue/', #Absolue path Mac Pro # 'Archive Folder': '/home/emmanuel/SynologyDrive/Basedig/Basedig Content Collector/Archive/', #Absolue path # 'Data Sources Folder': '/home/emmanuel/SynologyDrive/Basedig/Basedig Content Collector/Data Sources/', #Absolue path # 'json_parsed_files_folder': '/home/emmanuel/SynologyDrive/CloudStation/Basedig/Basedig Content Collector/json_parsed_files/', #Absolue path # 'temporary_files': '/home/emmanuel/SynologyDrive/Basedig/Basedig Content Collector/temporary_files/', #Absolue path #******************** COMMON SETTINGS ********************* 'Log_Template_Filename': 'Data Log File Template.xlsx' } XML_PARSING_PARAMETERS = { 'max_number_trials': 3, 'maximum_file_name_length': 150, 'maximum_length_dimension_value': 20, 'number_of_lines_between_logsave': 10, 'default data source': 'testsource', 'page number keyword': 'page=', 'result per page keyword': '&per_page=', 'results per page default value': 50, 'number_first_parameter_with_?': 2, 'overwrite number of pages to parse': True, #If True, overwrites the number_of_data_pages_to_parse variable to parse all pages 'recognised_date_formats': [ '%Y', '%Y-%m', '%Y-%b', '%Y/%B', '%Y/%m', '%m-%Y', '%b-%Y', '%B-%Y', '%Y-%m-%d', '%Y-%m-%d %H:%M:%S', '%Y-%b-%d', '%Y-%B-%d', '%Y/%m/%d', '%Y/%b/%d', '%Y/%B/%d', '%d-%m-%Y', '%d-%b-%Y', '%d-%B-%Y', '%d/%m/%Y', '%d/%b/%Y', '%d/%B/%Y' ], 'recognised_csv_formats': ['utf-8', 'latin-1'] } BASEBOOK_FORMATING ={ 'max length basebook name': 60 } WORKFLOW_PARAMETERS = { #List of columns for log file. Do not change the order 'Data Collector Loging Items': [ 'Collection ID', 'Source Name', 'Collection Date', 'Parsing Status', 'Basebook Data Container File Name', 'Data Collection Parameters', 'Comments', 'Source File URL', 'Data license' ], 'Collection Log Status': [ 'Created', 'Parsed', 'Failed' ] } DATA_FRAME_PROCESSING_PARAMETERS = { 'potential_boolean_values': ['True', 'False', 'true', 'false', '0', '1', 'Y', 'N', 'Yes', 'No', 'YES', 'NO', True, False], 'values_replaced_by_none': ['', ' ', ' ', '\n', '\n ', '..', '...', '.', '_', '__', 'N.A', 'n.a'], #These values in columns are collected 'symbols_empty_fields': ['NaN', 'NaT', '' ], #Symbols which could be interpreted as empty fields 'nan_strings_to_replace_for_json_files': [[' nan,', ' None,'],[' nan}', ' None}'], [' nan]', ' None]'], [' nan ', ' None ']], 'strings_of_python_objects': ['{', '}', '[', ']'], 'lenght_of_python_string_tested': 10, 'column types to index': ['text', 'date'], #Data taypes to be indexed in the basebook 'columns_to_remove_from_data': [], #Index of columns to remove from data after it was cleaned. Useful is some columns remain with "parasitic" characters. 'strings_to_remove_from_column_names': [[' ', '_']], 'clean_up data': True, #True if the data gatherting results of the dataframe are cleanedup after the data collection 'words_to_remove_from_index': [ '', ' ', 'un', 'une', 'de', 'en', 'a', 'avec', 'dans', "sans", 'par', 'le', 'la', 'ceci', 'cela', 'mais', 'of', 'off', 'but', 'with', 'without', 'in', 'the', 'der', 'das', 'dass', 'die' ], 'strings_to_replace': [ ['Ã©', 'é'], ['â', "'"], ['Ã¢', "Â"], ['Ã', 'à'], ['', ''], ['', ''], ['&', ''], ['\n', '.'], ['\r', ''], ['\t', ' '] ], 'strings_to_avoid_in_csv_filename': [ '.zip', '.xls', '.xlsx', '.pdf', '.jpg', '.jpeg', '.png' ] } DATAGOUVFR_SOURCE_METADATA = { 'core_url': 'https://www.data.gouv.fr/api/1/datasets', 'source_name': 'DATA.GOUV.FR', 'source_data_type': 'csv', #Can be XML, JSON, CSV, or other formats 'source_type': 'URL API', #Can be 'URL' or 'FILE' 'source_core_url': '', 'source_core_input_data_path': None, #Data path of input data file if source is a file 'source_tag_prefix': '', #Prefix of the node tags from the source 'node_level_data_tag': '', #Tag for the node 0 which contains the number of pages 'page_lastupdated_tag': '', #Tag for last updated page 'page_number_tag': '', #Tag which contains the total number of pages to process 'keywords': ['France', 'opendata', 'French', 'government', 'country'], 'categories': [], #List of categories 'description': 'Open Data provided by the French government', 'language': 'en', 'data_license': 'Open License', 'collected_files_folder': 'Data Gouv Fr/' } WEB_PARSING_PARAMETER = { 'strings_to_replace': [ ['Ã©', 'é'], ['â', "'"], ['Ã¢', "Â"], ['Ã', 'à'], ['', ''], ['', ''], ['&', '']] } #********************************************************************************************* #Checks if a string is a date. Returns true if string has a potential date format #********************************************************************************************* def is_date(string): recognised_formats = XML_PARSING_PARAMETERS['recognised_date_formats'] format_index = 0 #Checks for each format if the string is recognised as a format while (format_index < len(recognised_formats)): try: result = datetime.strptime( string, recognised_formats[format_index]) return True except ValueError: format_index += 1 return(False) #********************************************************************************************* # Recognises the potential date formats for a string and returns a list of potential formats # for the date # Recognised_formats: list containing the date formats #********************************************************************************************* def recognise_date_format(string, recognised_formats): format_index = 0 potential_format_list=list() #Checks for each format if the string is recognised as a format while (format_index < len(recognised_formats)): try: result = datetime.strptime( str(string), recognised_formats[format_index]) #Adds the format to the format list potential_format_list.append(recognised_formats[format_index]) format_index += 1 except ValueError: format_index += 1 return(potential_format_list) #********************************************************************************************* #Checks if the string is potentially a number #********************************************************************************************* def is_number(string): try: float(string) return(True) except ValueError: return(False) #********************************************************************************************* #Checks if the string is potentially a number #********************************************************************************************* def is_integer(string): try: int(string) return(True) except ValueError: return(False) #********************************************************************************************* #Checks if the string is potentially a number #Checks if inputstring is potentially a python object, by detecting symbols from objectsymbolslist in # the first analength caracters of the string # returns True if the string is potentially a python object. #********************************************************************************************* def is_python_object (inputstring, objectsymbolslist, analength): bufferstring = inputstring[0:analength] for item in objectsymbolslist: if item in bufferstring: return (True) return(False) #********************************************************************************************* # Removes element from list and returns the list without the item #********************************************************************************************* def remove_from_list(sourcelist, item): resultlist = [x for x in sourcelist if x != item] return(resultlist) #********************************************************************************************* # For a given input string, replaces all substring in the pairs list by their substitutes # Used to change space into substitute characters # #********************************************************************************************* def replace_all_substrings(inputstring, replacepairs): bufferstring = inputstring for item in replacepairs: bufferstring = bufferstring.replace(item[0], item[1]) return (bufferstring) #********************************************************************************************* # For a given input string, replaces all substring in the pairs list by their substitutes # Used to change space into substitute characters # #********************************************************************************************* def replace_all_substrings_in_all_items(inputlist, replacepairs): resultlist = [ replace_all_substrings(s, replacepairs) for s in inputlist] return(resultlist) #********************************************************************************************* # Groups list items by sublists of subgroupsize #********************************************************************************************* def group_list_items(sourcelist, subgroupsize): bufferlist = [sourcelist[n:n+subgroupsize] for n in range(0, len(sourcelist), subgroupsize)] return(bufferlist) #********************************************************************************************* # Splits all strings in single words in list of strings #********************************************************************************************* def split_all_strings_in_list(sourcelist, separator=" "): resultlist = list() for item in sourcelist: bufferitem = item.split(separator) resultlist += bufferitem return(resultlist) #Generates a list of keywords from a string def generate_keywords_from_string (inputstring, cleanup=True): bufferstring = slugify(inputstring, separator="_") bufferstring = bufferstring.replace("_", " ") keywordlist = bufferstring.split(" ") if (cleanup): for item in DATA_FRAME_PROCESSING_PARAMETERS['words_to_remove_from_index']: keywordlist = remove_from_list(keywordlist, item) return(keywordlist) #Detects file separator from first characters #numberchar: number of characters in the file #dellist: list of potential delimiters to be tested def detect_file_delimiter (inputfilename, numberchar=500, dellist=[',',';'], encodinglist=XML_PARSING_PARAMETERS['recognised_csv_formats']): file = io.open(inputfilename, mode="r", encoding="utf-8") encodingformatfound = False formatindex=0 while((not encodingformatfound) and (formatindex<len(encodinglist))): try:="" file="io.open(inputfilename," mode="r" ,="" encoding="encodinglist[formatindex])" sampledata="file.read(numberchar)" encodingformatfound="True" file.close()="" except:="" formatindex="" +="1" if="" not="" encodingformatfound:="" print('encoding="" format="" found')="" return="" (false)="" maxoccurences="0" delimiter="str()" for="" item="" in="" dellist:="" sampledata.count(item)=""> maxoccurences: delimiter = item maxoccurences = sampledata.count(item) file.close() return(delimiter) #********************************************************************************************* # Class for Dataframes included in Basebook Data Containers #********************************************************************************************* class BasebookDataFrame(): dataframe = None #pd.DataFrame() def __init__(self): self.dataframe = pd.DataFrame() #********************************************************************************************* #Displays the dataframe content #********************************************************************************************* def display(self): print('Data Frame: \n', self.dataframe) #********************************************************************************************* #Checks if the column is potentially a Boolean #********************************************************************************************* def check_boolean(self, col_number): #list of potential values for Booleans #Checks if data columns is a Boolean Values_Found = 0 Values = list() Three_values_found = False line_index = 0 while ((not Three_values_found) and (line_index < self.dataframe.shape[0])): if (self.dataframe.iloc[line_index, col_number] != None): buffer_data = self.dataframe.iloc[line_index, col_number] # If value not in potential boolean values, return False if ((not pd.isnull(buffer_data)) and (not (buffer_data in DATA_FRAME_PROCESSING_PARAMETERS['potential_boolean_values']))): return(False) if ((not pd.isnull(buffer_data)) and (not (buffer_data in Values))): Values_Found += 1 Values.append(buffer_data) if Values_Found > 2: # It is not a boolean Three_values_found = True line_index = self.dataframe.shape[0] + 1 # Exits the loop return(False) line_index += 1 # Increments the counter return (True) #********************************************************************************************* #Checks if the column is potentially a Date #********************************************************************************************* def check_date(self, col_number): non_date_detected = False # Starts with flag false line_index = 0 # Starts loop while ((not non_date_detected) and (line_index < self.dataframe.shape[0])): if (self.dataframe.iloc[line_index, col_number] != None): buffer_value = str(self.dataframe.iloc[line_index, col_number]) #Checks if cell is not null if (not buffer_value in DATA_FRAME_PROCESSING_PARAMETERS['symbols_empty_fields']): # Only applies the test if data is not null if (not is_date(buffer_value)): # If non date detected, return(False) line_index += 1 #No non date field found. Returns true return(True) #********************************************************************************************* #Checks if the column is potentially an int #********************************************************************************************* def check_int(self, col_number): non_int_detected = False #Starts with flag false line_index = 0 #Starts loop while (( not non_int_detected) and (line_index < self.dataframe.shape[0])): if (self.dataframe.iloc[line_index, col_number] != None): buffer_value = str(self.dataframe.iloc[line_index, col_number]) #Checks if cell is not null if (not pd.isnull(buffer_value)): #Only applies the test if data is not null if (not is_integer(buffer_value)): #If non integer detected, return(False) line_index += 1 #No non date field found. Returns true return(True) #********************************************************************************************* #Checks if the column is potentially a number #********************************************************************************************* def check_number(self, col_number): non_int_detected = False # Starts with flag false line_index = 0 # Starts loop while ((not non_int_detected) and (line_index < self.dataframe.shape[0])): if (self.dataframe.iloc[line_index, col_number] != None): buffer_value = str(self.dataframe.iloc[line_index, col_number]) #Checks if cell is not null if (not pd.isnull(buffer_value)): # Only applies the test if data is not null if (not is_number(buffer_value)): # If non date detected, return(False) line_index += 1 #No non date field found. Returns true return(True) #********************************************************************************************* #Checks if the column is potentially made of python objects. #Returns True if at least one line is potentially a python object #********************************************************************************************* def check_pythonobject(self, col_number): non_obj_detected = False # Starts with flag false line_index = 0 # Starts loop while ((not non_obj_detected) and (line_index < self.dataframe.shape[0])): if (self.dataframe.iloc[line_index, col_number] != None): buffer_value = str(self.dataframe.iloc[line_index, col_number]) #Checks if cell is not null if (not pd.isnull(buffer_value)): # Only applies the test if data is not null if (is_python_object(buffer_value, DATA_FRAME_PROCESSING_PARAMETERS['strings_of_python_objects'], DATA_FRAME_PROCESSING_PARAMETERS['lenght_of_python_string_tested'])): # If non date detected, return(True) line_index += 1 #No non date field found. Returns true return(False) #********************************************************************************************* #Function to check if data column in dataframe is empty #returns True if column is empty. Returns False if not #********************************************************************************************* def checkemptydatacolumn (self, column_number): if column_number>self.dataframe.shape[1]: #Returns False if column number is above total number of columns return(False) for row_number in range(0,self.dataframe.shape[0]): if (pd.isnull(self.dataframe.iloc[row_number, column_number]) == False): return False return(True) #********************************************************************************************* #Function to get the list of non empty data columns #returns True if column is empty. Returns list of emty data columns #********************************************************************************************* def getnonemptydatacolumns (self): non_empty_data_columns = list() #List is empty for column_number in range(0,self.dataframe.shape[1]): if (self.checkemptydatacolumn(column_number)==False): non_empty_data_columns.append(column_number) return(non_empty_data_columns) #********************************************************************************************* #Replaces all values of data frame that are forbidden values by NaN #********************************************************************************************* def replacevaluesbyNaN (self): for col_index in range(0, self.dataframe.shape[1]): for line_index in range(0, self.dataframe.shape[0]): #Checks if value in cell should be replaced by NaN if (self.dataframe.iloc[line_index, col_index] in DATA_FRAME_PROCESSING_PARAMETERS['values_replaced_by_none']): self.dataframe.iloc[line_index, col_index] = np.nan #Replaces value by nan if (self.dataframe.iloc[line_index, col_index] in DATA_FRAME_PROCESSING_PARAMETERS['symbols_empty_fields']): self.dataframe.iloc[line_index, col_index] = np.nan #Replaces value by nan #********************************************************************************************* #Function to remove all empty data columns #********************************************************************************************* def removemptydatacolumns (self): non_empty_data_columns = list() #List is empty #Gets all empty data columns non_empty_data_columns = self.getnonemptydatacolumns() for col_to_remove in DATA_FRAME_PROCESSING_PARAMETERS['columns_to_remove_from_data']: non_empty_data_columns.remove(col_to_remove) #Removes columns from list of colomns #Slices the data framce cleaned_data_frame = self.dataframe.iloc[:,non_empty_data_columns] self.dataframe = cleaned_data_frame return(cleaned_data_frame) #********************************************************************************************* #Determines the most likely types in columns of data frame #********************************************************************************************* def determine_column_data_types(self): local_col_names = self.dataframe.columns column_types = {k: None for k in local_col_names} #Goes over all columns for col_index in range(0, len(local_col_names)): col_data_type = 'text' # String is by default column_types[local_col_names[col_index]] = col_data_type if self.check_boolean(col_index): # Checks Boolean First col_data_type = 'boolean' column_types[local_col_names[col_index]] = col_data_type else: # Checks Dates if self.check_date(col_index): col_data_type = 'date' column_types[local_col_names[col_index]] = col_data_type else: # Checks if type is int if self.check_int(col_index): col_data_type = 'integer' column_types[local_col_names[col_index]] = col_data_type else: # Checks if type is number if self.check_number(col_index): col_data_type = 'double precision' column_types[local_col_names[col_index]] = col_data_type self.columntypes = column_types.copy() return(column_types) #********************************************************************************************* #Clean up data frame #********************************************************************************************* # BUG ICI (drop 0,5 Go of ram randomly ...) def cleanupdataframe(self, removeduplicates=True): self.replacevaluesbyNaN() self.removemptydatacolumns() #Drops duplicates only if there are more than 1 lines in the data frame if removeduplicates: print('Removing duplicates') # self.dataframe.to_excel(LOCAL_FOLDER['temporary_files'] + 'dataframedropduplicates.xlsx') if (self.dataframe.shape[0]>1): try: self.dataframe.drop_duplicates() except Exception as e: print('Could not drop duplicates: ', e) #********************************************************************************************* #Imports Dataframe and cleans up the data if cleanup = True #********************************************************************************************* def import_dataframe(self, input_dataframe, cleanup=False, removeduplicates = True): print('Dataframe copy') self.dataframe = input_dataframe.copy() print('Cleanup') if cleanup: self.cleanupdataframe( removeduplicates=removeduplicates) #********************************************************************************************* # Class to represent all potential combinations of a set of parameters. #********************************************************************************************* class ParameterSet(): parameters = None #dict() #Dictionnary with Parameter names and list of potential values for each parameter parameter_universe = None #pd.DataFrame() #Array with all potential combinations of valus of each parameter number_of_variable_parameters = 0 def __init__(self): self.parameters = dict() self.parameter_universe = pd.DataFrame() self.number_of_variable_parameters = 0 #Initialises the parameters def initialise(self, initnewparameters = True): if initnewparameters: newparameters = dict() self.parameters = newparameters.copy() newdataframe = pd.DataFrame() self.parameter_universe = newdataframe.copy() number_of_variable_parameters = 0 def display(self): print('********* PARAMETER SET ***********') print('Parameters:\n', self.parameters) print('Parameter_univere: \n', self.parameter_universe) print('********* END OF PARAMETER SET ***********\n') #********************************************************************************************* # Base Class for Data Source #********************************************************************************************* class DataSource: source_metadata = None def __init__(self): self.source_metadata = dict() self.source_metadata['source_name'] = str() self.source_metadata['source_data_type'] = str() self.source_metadata['source_type'] = str() self.source_metadata['source_core_url'] = str() self.source_metadata['source_core_input_data_path'] = str() self.source_metadata['source_tag_prefix'] = str() self.source_metadata['node_level_data_tag'] = str() self.source_metadata['page_lastupdated_tag'] = str() self.source_metadata['keywords'] = list() self.source_metadata['categories'] = list() self.source_metadata['description'] = str() self.source_metadata['data_license'] = str() def display(self): print('Source metadata: \n', self.source_metadata) #********************************************************************************************* # Base Class for Data Collectors #********************************************************************************************* class DataCollectorLog(): logitems = None #WORKFLOW_PARAMETERS['Data Collector Loging Items'] #Loads the names of loging fields logdata = None #pd.DataFrame() Contains the log data def __init__(self): self.logitems = WORKFLOW_PARAMETERS['Data Collector Loging Items'] self.logdata = pd.DataFrame() def read_logfile(self, filename): pathandfilename = LOCAL_FOLDER['parsing_log_folder'] + filename try: file = open( pathandfilename , 'r') self.logdata = pd.read_excel(pathandfilename) file.close() except: print('Error while opening Data Collector Log File: ', pathandfilename ) return(False) return(True) def save_logfile(self, filename): bufferfilename = filename+'.xlsx' self.logdata.to_excel(bufferfilename) def logdisplay(self): print('******************** DATA COLLECTOR LOG ************************') print('Log Items: ', self.logitems) print ('Log Data: \n', self.logdata) print('******************** END OF DATA COLLECTOR LOG ************************') #********************************************************************************************* # Base Class for Data Collectors #********************************************************************************************* class DataCollector(DataSource, DataCollectorLog): #Dictionnary containing the parameters, and for each parameter the list of potential values # Ex: {'Parameter 1': {'Keyword1', [Value1, Value2]}, 'Parameter 2': {'Keyword2', [Value3, Value4]}} # Ex: {'Indicator': '} source_parameters = None #dict() parameter_medatada = None #dict () #Contains for each parameter, the parameter values as key, and the {'Description':, 'Categories': and 'Keywords': metadata} parsing_parameters_imported = False #Flag to check if parameters are imported parsing_scheme_built = False basebookdata = None #BasebookDataFrame() basebookcontainer = None#BasebookDataContainer() parsing_result_data = None #BasebookDataFrame() #Dataframe which contains the result of the data parsing parsing_scheme = None #ParameterSet() #Parsing scheme for all parameter combinations def __init__(self, inputsource=None): DataSource.__init__(self) DataCollectorLog.__init__(self) self.logitems = WORKFLOW_PARAMETERS['Data Collector Loging Items'] self.logdata = pd.DataFrame() self.source_parameters = dict() self.parameter_medatada = dict () #Contains for each parameter, the parameter values as key, and the {'Description':, 'Categories': and 'Keywords': metadata} #Imports directly the input source metadata if available if (inputsource != None): self.source_metadata = inputsource self.parsing_parameters_imported = False #Flag to check if parameters are imported self.parsing_scheme_built = False self.parsing_result_data = BasebookDataFrame() #Dataframe which contains the result of the data parsing self.parsing_scheme = ParameterSet() #Parsing scheme for all parameter combinations self.basebookdata = BasebookDataFrame() if (inputsource == None): self.basebookcontainer = BasebookDataContainer() else: #Imports the metadata directly from the source self.basebookcontainer = BasebookDataContainer(id=None, source=inputsource['source_name'], name=inputsource['source_name'], description=inputsource['description'], keywords=inputsource['keywords'], datalicense= inputsource['data_license']) def display(self): print('********* DATA COLLECTOR ***********') super().display() print('Source Parameters: \n', self.source_parameters) print('Parsing scheme: \n') self.parsing_scheme.display() print('Basebook data:\n') self.basebookdata.display() print('Basebook Data Container: ') self.basebookcontainer.display() print('Parsed Data:') #Adds the parsing results to the display self.parsing_result_data.display() print('********* END OF DATA COLLECTOR ***********\n') #********************************************************************************************* # Inserts a log line into the collector log data frame # Log status is a number: 0 is created, 1 is 'Parsed', 2 is 'Failed' #********************************************************************************************* def add_collector_log_line(self, sourcename, logstatus, resultfilename, parameterdescription, logcomment=None, sourcefileurl=None, datalicense=None): collection_line_items = dict() lognumber = self.logdata.shape[0] #Number of rows of the existing dataframe is the log id #Adds the log number and source name collection_line_items[WORKFLOW_PARAMETERS['Data Collector Loging Items'][0]] = int(lognumber) collection_line_items[WORKFLOW_PARAMETERS['Data Collector Loging Items'][1]] = self.source_metadata['source_name'] #Inserts the log date now = datetime.now() collection_line_items[WORKFLOW_PARAMETERS['Data Collector Loging Items'][2]] = now.strftime("%Y-%m-%d %H:%M") collection_line_items[WORKFLOW_PARAMETERS['Data Collector Loging Items'][3]] = WORKFLOW_PARAMETERS['Collection Log Status'][logstatus] collection_line_items[WORKFLOW_PARAMETERS['Data Collector Loging Items'][4]] = resultfilename collection_line_items[WORKFLOW_PARAMETERS['Data Collector Loging Items'][5]] = parameterdescription collection_line_items[WORKFLOW_PARAMETERS['Data Collector Loging Items'][6]] = logcomment collection_line_items[WORKFLOW_PARAMETERS['Data Collector Loging Items'][7]] = sourcefileurl collection_line_items[WORKFLOW_PARAMETERS['Data Collector Loging Items'][8]] = datalicense self.logdata = self.logdata.append(collection_line_items, ignore_index=True) return(collection_line_items) #******************************************** # Class DataCollector # Imports all basebooks from the core URL of the source # Stores basebook in the outputdirectory #******************************************************************* def parse_all_basebooks_from_datagouvfr_page (self, coreurl, pagenumber, startdataset=None, stopdataset=None, outputdirectory=None, inputformat='csv', engine='c', cleanup= True, removeduplicates=True, pagesize=20): bufferbasebook = BasebookDataContainer() firstmetadata = bufferbasebook.import_json_metadata_from_datagouvfr(coreurl, pagenumber, pagesize=pagesize) #Sets boundaries if (startdataset != None): datasetmin = max(0,startdataset) else: datasetmin = 0 if (stopdataset != None): datasetmax = min(stopdataset, len(firstmetadata["data"])) else: datasetmax = pagesize parsedfilecounter = 0 print('***************************PARSING FILES OF PAGE {}**********************************'.format(pagenumber)) print('Format: ', inputformat, 'Start Dataset: ', datasetmin, 'End Dataset: ', datasetmax, 'Parsing engine: ', engine) #Scans all datasets for dataindex in range( datasetmin, datasetmax): for resourceindex in range (0, len(firstmetadata["data"][dataindex]["resources"])): print('Parsing dataset: Page {}, Dataset {}, Resource {}'.format(pagenumber, dataindex, resourceindex)) #If the format of the resource matches the inputformat parameter, the file is parsed if (firstmetadata["data"][dataindex]["resources"][resourceindex]["format"] == inputformat): resultbasebook = self.generate_basebook_from_datagouvfr_page ( coreurl=coreurl, pagenumber=pagenumber, datasetnumber=dataindex, resourcenumber=resourceindex, inputformat=inputformat, engine=engine, cleanup=cleanup, removeduplicates=removeduplicates, pagesize=pagesize) if (resultbasebook == False): print('Basebook could not be generated. Dataset {}, Resource {}'.format(dataindex, resourceindex)) else: #Basebook generation successfull-File stored if (outputdirectory !=None): filename = outputdirectory else: filename = str() filename += resultbasebook.datacontainer['metadata']['basebook']['name'] print('Saving resource to file: ', filename) debug=False resultbasebook.save_to_json(filename, '__all__', xlscopy=False, formatdata=True, debuginfo=debug) print('File saved to json successfully') parsedfilecounter +=1 else: print('Resource format ', firstmetadata["data"][dataindex]["resources"][resourceindex]["format"], ' does not match inputformat. Resource not parsed.') print('***************************PAGE {} FULLY PARSED - {} FILES COLLECTED**********************************'.format(pagenumber, parsedfilecounter)) return(parsedfilecounter) #******************************************** # Class DataCollector # Imports all basebooks from the core URL of the source # Stores basebook in the outputdirectory #******************************************************************* def generate_basebook_from_datagouvfr_page (self, coreurl, pagenumber, datasetnumber, resourcenumber, inputformat='csv', engine='c', cleanup= True, removeduplicates=True, pagesize=20): bufferbasebook = BasebookDataContainer() pagemetadata = bufferbasebook.import_json_metadata_from_datagouvfr(coreurl, pagenumber, pagesize=pagesize) #Loads the metadata bufferbasebook.datacontainer["metadata"]["basebook"]["name"] = str(resourcenumber) +'-'+pagemetadata["data"][datasetnumber]["title"] bufferbasebook.datacontainer["metadata"]["basebook"]["source"] = self.source_metadata["source_name"] #Imports the tags into the keywords bufferlist = list() if (pagemetadata['data'][datasetnumber]['tags'] != None): for item in pagemetadata['data'][datasetnumber]['tags']: bufferstring = item.replace("-", " ") bufferlist.append(bufferstring) bufferbasebook.datacontainer["metadata"]["basebook"]["keywords"] = bufferlist bufferbasebook.datacontainer['metadata']['basebook']['description'] = pagemetadata["data"][datasetnumber]["title"] + ". Données fournies par Data.Gouv.fr. " if (pagemetadata['data'][datasetnumber]['organization'] != None ): bufferbasebook.datacontainer['metadata']['basebook']['description'] += ' et ' + pagemetadata['data'][datasetnumber]['organization']['name'] + '. ' if (pagemetadata['data'][datasetnumber]['owner'] != None ): bufferbasebook.datacontainer['metadata']['basebook']['description'] += pagemetadata['data'][datasetnumber]['owner']['first_name'] + ' ' + pagemetadata['data'][datasetnumber]['owner']['last_name'] bufferbasebook.datacontainer["metadata"]["basebook"]["description"] += ' ' + pagemetadata["data"][datasetnumber]["description"] bufferbasebook.datacontainer["metadata"]["basebook"]["data_license"] = pagemetadata["data"][datasetnumber]["license"] #Imports the Data URL datafileurl = pagemetadata['data'][datasetnumber]['resources'][resourcenumber]['url'] print('Datafile URL:', datafileurl) bufferformat = '.' + inputformat if bufferformat in datafileurl: importresult = bufferbasebook.import_csv_data_from_url(datafileurl, inputcleanup=True, separator=None, cleancolumnnames=True, formatlist=XML_PARSING_PARAMETERS['recognised_csv_formats'], engine=engine, debuginfo=False, removeduplicates=removeduplicates) print('Basebook imported successfully') #Adds collector log line self.add_collector_log_line(self.source_metadata['source_name'], 1, bufferbasebook.datacontainer['metadata']['basebook']['name'], "File Parsing", logcomment='Success', sourcefileurl=datafileurl, datalicense=bufferbasebook.datacontainer['metadata']['basebook']['data_license']) else: print('Basebook data not imported: file format does not match input format: ', inputformat) importresult = False bufferbasebook.format_for_export() bufferbasebook.display(data=False) if (importresult == False): return(False) return(bufferbasebook) #******************************************** # Class DataCollector # Imports all basebooks from the core URL of the source # Stores basebooks in the outputdirectory #******************************************************************* def parse_all_pages_from_datagouvfr (self, coreurl, startpage, endpage, startdataset=None, stopdataset=None, outputdirectory=None, inputformat='csv', engine='c', cleanup=True, removeduplicates=True, pagesize=20): print('******************************PARSES ALL PAGES FROM DATA.GOUV.FR***********************************') print('Start page: ', startpage, 'End page: ', endpage, 'Input format: ', inputformat, 'Engine: ', engine) parsedfilecounter = 0 parsingstarttime = datetime.now() for pageindex in range(startpage, endpage): parsedfilecounter += self.parse_all_basebooks_from_datagouvfr_page( coreurl, pageindex, startdataset=startdataset, stopdataset=stopdataset, outputdirectory=outputdirectory, inputformat=inputformat, engine=engine, cleanup=cleanup, removeduplicates=removeduplicates, pagesize=pagesize) #self.save_logfile(LOCAL_FOLDER['parsing_log_folder']+ self.source_metadata['source_name'] + LOCAL_FOLDER['File_parsing_log_filename'] + parsingstarttime.strftime("%Y-%m-%d %Hh%M")) print('**********************PAGES {} TO {} PARSED SUCCESSFULLY - {} FILES PARSED ***********************************'.format(startpage, endpage-1, parsedfilecounter)) return(parsedfilecounter) #********************************************************************************************* #Basebook Column for Basebook Data Container #********************************************************************************************* class BasebookDataColumn: datacontainer = None def __init__(self): self.datacontainer = { "name": str(), "index_key": "name_only", # index key determines how data is indexed. must be none or name_only or indexed. Name only by default. "description": str(), "keywords": list(), #[], "db_type": str(), "date_format": str() } #Displays content of object def display(self): print(self.datacontainer) #Generates keywords list from column name def generate_keywords_from_name(self): bufferlist=list() bufferstring = slugify(self.datacontainer['name'], separator="_") #Generates keyword list from column names bufferlist = bufferstring.split("_") if (self.datacontainer['keywords'] != None): self.datacontainer['keywords'] += bufferlist else: self.datacontainer['keywords'] = bufferlist #Removes duplicates in list self.datacontainer['keywords'] = list(set(self.datacontainer['keywords'])) #********************************************************************************************* #Provides data about the data source page #********************************************************************************************* class BasebookDataContainer: datacontainer = None def __init__(self, id=None, source=None, name=None, description=None, keywords=None, datalicense=None, sourcemetadata=None): self.datacontainer = { "metadata": { "basebook": { "id": None, "source": None, #XML_PARSING_PARAMETERS['default data source'], #Source name "name": None, #str(), #Basebook Name "description": None, #str(), #Basebook description "keywords": None, #[], #Keywords. Are created by API if does not exist already "categories": None, #Categories # Database content orientation horizontal or vertical ( vertical by default) "content_orientation": 'vertical', "x_columns": None, "y_columns": None, # Specific id for permission level "access_rights": 2, #Access right level. 2 by default. Enables viewing the structured data in browser "related_basebooks": None, "language": "en-us", "data_license": None, #Data License "columns": None #[], #List containing the Column Data Containers } }, 'data': None #dict() #Contains the actual data. Will be stored in the form of dictionnary } #Inputs data from single parameters if (sourcemetadata == None): self.datacontainer['metadata']['basebook']['id'] = id self.datacontainer['metadata']['basebook']['source'] = source self.datacontainer['metadata']['basebook']['name'] = name self.datacontainer['metadata']['basebook']['description'] = description if keywords != None: self.datacontainer['metadata']['basebook']['keywords'] = keywords else: self.datacontainer['metadata']['basebook']['keywords'] = list() self.datacontainer['metadata']['basebook']['categories'] = list() self.datacontainer['metadata']['basebook']['x_columns'] = list() self.datacontainer['metadata']['basebook']['y_columns'] = list() self.datacontainer['metadata']['basebook']['related_basebooks'] = list() self.datacontainer['metadata']['basebook']['data_license'] = datalicense self.datacontainer['metadata']['basebook']['columns'] = list() self.datacontainer['data'] = dict() else: self.datacontainer['metadata']['basebook']['source'] = sourcemetadata['source_name'] self.datacontainer['metadata']['basebook']['name'] = str() self.datacontainer['metadata']['basebook']['description'] = sourcemetadata['description'] if sourcemetadata['keywords'] != None: self.datacontainer['metadata']['basebook']['keywords'] = sourcemetadata['keywords'] else: self.datacontainer['metadata']['basebook']['keywords'] = list() self.datacontainer['metadata']['basebook']['categories'] = sourcemetadata['categories'] self.datacontainer['metadata']['basebook']['x_columns'] = list() self.datacontainer['metadata']['basebook']['y_columns'] = list() self.datacontainer['metadata']['basebook']['related_basebooks'] = list() self.datacontainer['metadata']['basebook']['data_license'] = sourcemetadata['data_license'] self.datacontainer['metadata']['basebook']['columns'] = list() self.datacontainer['data'] = dict() #Displays content of object def display(self, data=True): print(self.datacontainer['metadata']) if data: print('Data: ') print(self.datacontainer['data']) # Saves data object to JSON File. Fieldname is either the Metadata or the data. # filename: name of file without json extension # containerfieldname specifies if only the metadata is stored, or if the data is stored as well. By default: data. if '__all__', all the fields are loaded # if xlscopy, stores in the same folder a copy of data and metadata def save_to_json(self, filename, containerfieldname='__all__', xlscopy=False, formatdata=True, debuginfo=False ): #Stores json file first buffer_filename = filename + '.json' buffer_dataframe = pd.DataFrame() print('Saving to json') with open(buffer_filename, 'w') as outfile: if containerfieldname != '__all__': print('Writing to string:') teststring = json.dumps(self.datacontainer[containerfieldname], allow_nan=True, sort_keys = True, indent = 4, ensure_ascii = False) print('String dump:', teststring) json.dump(self.datacontainer[containerfieldname], outfile, allow_nan=True, sort_keys = True, indent = 4, ensure_ascii = False) else: if formatdata: self.format_for_export() if (debuginfo): print('Data container keys:', self.datacontainer['data'].keys()) #Saving dict to file: with open('dataverif.csv', 'w') as csv_file: writer = csv.writer(csv_file) for key, value in self.datacontainer.items(): writer.writerow([key, value]) try: json.dump(self.datacontainer, outfile, allow_nan=True, sort_keys = True, indent = 4, ensure_ascii = False) except Exception as message: print('Error while saving basebook to json. Error message:', message) return(False) outfile.close() print('Basebook stored to JSON File: ', filename) #If necessary, Stores data into XLSX file first if(xlscopy): buffer_filename = filename + '.xlsx' buffer_dataframe=pd.DataFrame.from_dict(self.datacontainer['data']) buffer_dataframe.to_excel(buffer_filename) #Stores the dataframe in XLS format print('Basebook data stored to XLS File: ', filename) del buffer_filename return(True) #Loads Data object from JSON File. Fieldname is either the Metadata or the data. By default: data. if '__all__', all the fields are loaded def load_from_json(self, filename, containerfieldname='__all__'): print('Loading basebook from file: ', filename) json_string = str() with open(filename) as infile: json_string=infile.read() if containerfieldname != '__all__': #Just loads one field self.datacontainer[containerfieldname]=json.loads(json_string) else: #Loads everything self.datacontainer['metadata']=json.loads(json_string)['metadata'] self.datacontainer['data']=json.loads(json_string)['data'] infile.close() print('Basebook loaded from JSON File: ', filename) #Loads Data object from JSON File. Fieldname is either the Metadata or the data. By default: data. if '__all__', all the fields are loaded def load_from_csv(self, filename, cleanup = True, separator = None, cleancol = False, encoding='utf-8', formatlist = XML_PARSING_PARAMETERS['recognised_csv_formats'], engine='c', debuginfo=False, removeduplicates=True): print('Loading from csv file: ', filename) #Checks if forbidden strings are in file name: for forbidstring in DATA_FRAME_PROCESSING_PARAMETERS['strings_to_avoid_in_csv_filename']: if forbidstring in filename: print("File contains forbidden types and is probably not a .csv - not loaded") return(False) bufferdataframe = BasebookDataFrame() formatfound = False formatindex = 0 if formatlist == None: formatlist = list(encoding) #Detects the separator of the CSV file: if separator == None: delimiter = detect_file_delimiter(filename) else: delimiter = separator print('File column delimiter: ', delimiter) print('Format list: ', formatlist) #Scans all potential formats to find the suitable format while ((not formatfound) and (formatindex < len(formatlist))): try: print('Reading dataframe from csv: ', filename, 'delimiter: ', delimiter, 'Encoding: ', formatlist[formatindex], 'Engine: ', engine) bufferdataframe.dataframe = pd.read_csv(filename, sep=delimiter, encoding=formatlist[formatindex], engine=engine) print ('Format found: ', formatlist[formatindex]) formatindex = len(formatlist) formatfound = True except Exception as message: print('Exception while reading file: ', message) formatindex +=1 if (not formatfound): print('Error while reading file. Format not recognised. Format list: ', formatlist) return(False) if (debuginfo == True): print('Dataframe to import:') bufferdataframe.display() self.import_dataframe(bufferdataframe.dataframe, generatemetadata=True, cleanupdata=True, removeduplicates=removeduplicates) print('Basebook loaded from CSV File: ', filename) del(bufferdataframe) return(True) #Imports JSON data and formats the data into Basebook Data Container #File is downloaded first into a temporary folder #Cleanup = True: data is cleaned up def import_json_data_from_url (self, inputurl, cleanup = True, debuginfo=False): now = datetime.now() randomid = random.randint(1,101) #Opens file and downloads file to temporary folder #File name contains random numbers and time stamp to make it unique even in case of multicore execution filename = LOCAL_FOLDER['temporary_files'] + 'temporaryfile' + now.strftime("%Y-%m-%d %H:%M") + " " + str(randomid) + '.json' print('Downloading file from url: ', inputurl) urllib.request.urlretrieve(inputurl, filename) print('File downloaded successfuly\n') #Loads the data from the JSON file self.load_from_json(filename, 'data') if cleanup: print('Cleaning up data') self.cleanup_opendataparis_data() #deletes the file if os.path.exists(filename): os.remove(filename) def import_excel_data_from_url (self, inputurl, inputcleanup = True): #Opens file and downloads file to temporary folder filename = LOCAL_FOLDER['temporary_files'] + 'temporaryfile.xls' print('Downloading file from url: ', inputurl) try: urllib.request.urlretrieve(inputurl, filename) print('File downloaded successfuly\n') except: print('import_excel_data_from_url Error while retrieving URL: ', inputurl) return(False) #Loads the data from the JSON file result = self.load_from_excel(filename, cleanup=inputcleanup) #deletes the file if os.path.exists(filename): os.remove(filename) return(result) #******************************************************************* # Class Basebookdatacontainer # Imports CSV Data from URL #******************************************************************* def import_csv_data_from_url (self, inputurl, inputcleanup = True, separator = None, cleancolumnnames = True, encoding='utf-8', formatlist = None, engine='c', debuginfo = False, removeduplicates = True): now = datetime.now() randomid = random.randint(1,101) #Opens file and downloads file to temporary folder #File name contains random numbers and time stamp to make it unique even in case of multicore execution filename = LOCAL_FOLDER['temporary_files'] + 'temporaryfile' + now.strftime("%Y-%m-%d %H:%M") + " " + str(randomid) + '.json' if debuginfo: print('Start of import') print('Downloading file from url: ', inputurl) try: urllib.request.urlretrieve(inputurl, filename) print('File opened successfuly\n') except: print('import_csv_data_from_url error while retrieving url:', inputurl) return(False) #Loads the data from the JSON file print('Loading from csv. Engine: ', engine) result = self.load_from_csv(filename, cleanup=inputcleanup, separator=separator, cleancol=cleancolumnnames, encoding=encoding, formatlist=formatlist, engine=engine, debuginfo=debuginfo, removeduplicates=removeduplicates) print('Basebook Data Container loaded from csv successfully') #deletes the file if os.path.exists(filename): os.remove(filename) return(result) #******************************************************************* # Class Basebookdatacontainer # Imports Metadata from DataGouv # coreurl: URL of the API # Pagenumber: Page number in the dataset # format: Format of file to be downloaded # Pagesize: number of records per age # Returns a dict with all the metadata #******************************************************************* def import_json_metadata_from_datagouvfr (self, coreurl, pagenumber, format='csv', pagesize=20): #Computes the URL to retrieve the data inputurl = coreurl + '/?format=' + format + '&page=' + str(pagenumber) + '&page_size=' + str(pagesize) print('Importing metadata from url: ', inputurl) with urllib.request.urlopen(inputurl) as url: data = json.loads(url.read().decode()) print("Imported JSON Metadata successfully. Number of datasets: ", len(data['data'])) return(data) # Imports dataframe into data container. If generatemetadata=True: metadata for columns are prepared. If cleanupdata: data are cleaned def import_basebookdataframe(self, inputbasebookdataframe, generatemetadata=True): buffer_column_list = list() cleaned_data_frame = pd.DataFrame() cleaned_data_frame = inputbasebookdataframe.dataframe.copy() local_column_types=inputbasebookdataframe.determine_column_data_types() if(generatemetadata): #Generates Metadata #Generates the columns and the column names for column in range(0,cleaned_data_frame.shape[1]): localdatacolumn = BasebookDataColumn() localdatacolumn.datacontainer['name']=slugify(cleaned_data_frame.columns[column], separator="_") #Generates keywords list from name localdatacolumn.generate_keywords_from_name() localdatacolumn.datacontainer['db_type']=local_column_types[cleaned_data_frame.columns[column]] if (local_column_types[cleaned_data_frame.columns[column]] in DATA_FRAME_PROCESSING_PARAMETERS ['column types to index']): localdatacolumn.datacontainer['index_key'] = 'indexed' #Checks if the column potentially contains python objects, in order not to index if (inputbasebookdataframe.check_pythonobject(column)): localdatacolumn.datacontainer['index_key'] = 'name_only' else: localdatacolumn.datacontainer['index_key'] = 'name_only' buffer_column_list.append(localdatacolumn.datacontainer.copy()) #Adds a copy not to pass the same adress self.datacontainer['metadata']['basebook']['columns'] = buffer_column_list self.cleanup_column_names() #Imports the data into the list and slugifies the data column names bufferdict = cleaned_data_frame.to_dict('dict') self.datacontainer['data'] = dict() #Slugifies the column names in the data for item in bufferdict.keys(): bufferstring = slugify(item, separator="_") self.datacontainer['data'][bufferstring]=bufferdict[item] #Imports a dataframe directly into basebook. Cleans up data if necessary and generates metadata def import_dataframe(self, inputdataframe, generatemetadata=True, cleanupdata=True, removeduplicates=True): bufferbbdf = BasebookDataFrame() bufferbbdf.import_dataframe(inputdataframe, cleanup=cleanupdata, removeduplicates=removeduplicates) self.import_basebookdataframe(bufferbbdf, generatemetadata=generatemetadata) #converts all nan values to None in order to be able to enable JSON export def convert_nanvalues_to_none (self): #Converts the metadata into a string without nan #print('\nBasebook Metadata before converting nan values:', self.datacontainer['metadata']) buffer_string = str(self.datacontainer['metadata']) #Replaces all nan values by None for item in DATA_FRAME_PROCESSING_PARAMETERS['nan_strings_to_replace_for_json_files']: buffer_string = buffer_string.replace(item[0], item[1]) #print('\n\nBuffer String before litteral: ', buffer_string) self.datacontainer['metadata'] = eval(buffer_string) #Converts string back into dictionnary #print('\nBasebook Metadata after converting nan values:', self.datacontainer['metadata']) buffer_string = str(self.datacontainer['data']) #Replaces all nan values by None for item in DATA_FRAME_PROCESSING_PARAMETERS['nan_strings_to_replace_for_json_files']: buffer_string = buffer_string.replace(item[0], item[1]) self.datacontainer['data'] = eval(buffer_string) #Cleans up the column names and column keywords prior to export def cleanup_column_names(self): for colindex in range(0, len(self.datacontainer['metadata']['basebook']['columns'])): #Splits column keyword list self.datacontainer['metadata']['basebook']['columns'][colindex]['keywords'] = split_all_strings_in_list (self.datacontainer['metadata']['basebook']['columns'][colindex]['keywords'], " ") for reppair in DATA_FRAME_PROCESSING_PARAMETERS['strings_to_remove_from_column_names']: self.datacontainer['metadata']['basebook']['columns'][colindex]['name'] = self.datacontainer['metadata']['basebook']['columns'][colindex]['name'].replace(reppair[0], reppair[1]) #********************************************************************************************* # Formats the Basebook for export # Slugifies the column names # Splits keyword lists into single keywords # Makes sure that the columns a lists and not None #********************************************************************************************* def format_for_export (self, sourcename = None, datalicense = None): #Slugifies the column names in the data: bufferdict = dict() for item in self.datacontainer['data'].keys(): bufferstring = slugify(item, separator = "_") bufferdict[bufferstring] = self.datacontainer['data'][item] self.datacontainer['data'] = bufferdict #Slugifies the column names in the column lists and splits the keyword lists for colindex in range(0, len(self.datacontainer['metadata']['basebook']['columns'])): self.datacontainer['metadata']['basebook']['columns'][colindex]['name'] = slugify (self.datacontainer['metadata']['basebook']['columns'][colindex]['name'], separator="_") #Splits the keyword lists in single words if(self.datacontainer['metadata']['basebook']['columns'][colindex]['keywords'] != None): self.datacontainer['metadata']['basebook']['columns'][colindex]['keywords'] = split_all_strings_in_list (self.datacontainer['metadata']['basebook']['columns'][colindex]['keywords'], " ") else: self.datacontainer['metadata']['basebook']['columns'][colindex]['keywords'] = list() #Generates keywords from name buffercolumn = BasebookDataColumn() buffercolumn.datacontainer['name'] = self.datacontainer['metadata']['basebook']['columns'][colindex]['name'] buffercolumn.datacontainer['keywords'] = self.datacontainer['metadata']['basebook']['columns'][colindex]['keywords'] buffercolumn.generate_keywords_from_name() self.datacontainer['metadata']['basebook']['columns'][colindex]['keywords'] = buffercolumn.datacontainer['keywords'] #Removes duplicates in Keywords list self.datacontainer['metadata']['basebook']['columns'][colindex]['keywords'] = list(set(self.datacontainer['metadata']['basebook']['columns'][colindex]['keywords'])) #Cleans up the keywords from forbidden strings self.datacontainer['metadata']['basebook']['columns'][colindex]['keywords'] = replace_all_substrings_in_all_items (self.datacontainer['metadata']['basebook']['columns'][colindex]['keywords'], DATA_FRAME_PROCESSING_PARAMETERS['strings_to_replace']) #Formats the description and replaces _ by spaces and removes wrong formating characters if(self.datacontainer['metadata']['basebook']['columns'][colindex]['description'] != None): self.datacontainer['metadata']['basebook']['columns'][colindex]['description'] = replace_all_substrings(self.datacontainer['metadata']['basebook']['columns'][colindex]['description'],[["_", " "], ["-", " "]]) self.datacontainer['metadata']['basebook']['columns'][colindex]['description'] = replace_all_substrings(self.datacontainer['metadata']['basebook']['columns'][colindex]['description'], DATA_FRAME_PROCESSING_PARAMETERS['strings_to_replace']) #Determines the date format if the column is of date type if(self.datacontainer['metadata']['basebook']['columns'][colindex]['db_type'] == 'date'): potential_format_list = self.recognise_column_date_format(self.datacontainer['metadata']['basebook']['columns'][colindex]['name'], XML_PARSING_PARAMETERS['recognised_date_formats']) if (potential_format_list == False): self.datacontainer['metadata']['basebook']['columns'][colindex]['date_format'] = None else: self.datacontainer['metadata']['basebook']['columns'][colindex]['date_format'] = potential_format_list[0] #Splits the keywords and makes it a list if None if (self.datacontainer['metadata']['basebook']['keywords'] != None): self.datacontainer['metadata']['basebook']['keywords'] = split_all_strings_in_list(self.datacontainer['metadata']['basebook']['keywords'], " ") else: self.datacontainer['metadata']['basebook']['keywords'] = list() #Removes duplicates in Keywords list self.datacontainer['metadata']['basebook']['keywords'] = list(set(self.datacontainer['metadata']['basebook']['keywords'])) #Cleans up the keywords list from forbidden characters self.datacontainer['metadata']['basebook']['keywords'] = replace_all_substrings_in_all_items (self.datacontainer['metadata']['basebook']['keywords'], DATA_FRAME_PROCESSING_PARAMETERS['strings_to_replace']) #Inputs the source name: if (sourcename != None): if (self.datacontainer['metadata']['basebook']['source'] == None): self.datacontainer['metadata']['basebook']['source'] = sourcename #Formats the description and replaces _ by spaces if(self.datacontainer['metadata']['basebook']['description'] != None): self.datacontainer['metadata']['basebook']['description'] = replace_all_substrings(self.datacontainer['metadata']['basebook']['description'], [["_", " "], ["-", " "], [" ", " "], [" ", " "]]) self.datacontainer['metadata']['basebook']['description'] = replace_all_substrings(self.datacontainer['metadata']['basebook']['description'], DATA_FRAME_PROCESSING_PARAMETERS['strings_to_replace']) #Formats the name and replaces _ by spaces if(self.datacontainer['metadata']['basebook']['name'] != None): self.datacontainer['metadata']['basebook']['name'] = slugify(self.datacontainer['metadata']['basebook']['name'], separator="_") self.datacontainer['metadata']['basebook']['name'] = replace_all_substrings(self.datacontainer['metadata']['basebook']['name'],[["_", " "], ["-", " "], [" ", " "], [" ", " "]]) #Reduces the size to the maximal allowed size for POSTGRESQL self.datacontainer['metadata']['basebook']['name'] = self.datacontainer['metadata']['basebook']['name'][0:BASEBOOK_FORMATING['max length basebook name']] #Changes the string self.datacontainer['metadata']['basebook']['source'] = self.datacontainer['metadata']['basebook']['source'].upper() self.datacontainer['metadata']['basebook']['source'] = self.datacontainer['metadata']['basebook']['source'].replace("_", " ") #Inputs the data license: if (self.datacontainer['metadata']['basebook']['data_license'] == None): self.datacontainer['metadata']['basebook']['data_license'] = datalicense #Formats the X and Y Columns: if (self.datacontainer['metadata']['basebook']['x_columns'] == None): self.datacontainer['metadata']['basebook']['x_columns'] = list() if (self.datacontainer['metadata']['basebook']['y_columns'] == None): self.datacontainer['metadata']['basebook']['y_columns'] = list() #********************************************************************************************* # Recognises the potential date format of a data column # Assumes the column has only date data # Returns a list of potential date formats if found. Returns False if data format not found. #********************************************************************************************* def recognise_column_date_format(self, columnkey, recognisedformats): formatlist = recognisedformats formatfound = False rowindex=0 datakeys = list(self.datacontainer["data"][columnkey].keys()) while ((not formatfound) and (rowindex < len(datakeys))): bufferlist = recognise_date_format(self.datacontainer["data"][columnkey][datakeys[rowindex]], formatlist) #If the list of potential formats is empty, the date format is not recognised if len(bufferlist) == 0: return(False) #If length is 1, there is only one potential format if len(bufferlist) == 1: formatfound = 1 return(bufferlist) #If list has several values, continue scaning to reduce the number of potential formats formatlist = bufferlist rowindex += 1 # ****************************************************************************************************************** # MAIN # ****************************************************************************************************************** print('\n\n************************** START BASEDIG CONTENT COLLECTOR AT ', datetime.now(), ' ********************************') print('Task: ',EXECUTION_PATH['task']) if EXECUTION_PATH['task']== 'Import Data.Gouv.Fr': basedigcollector = DataCollector(DATAGOUVFR_SOURCE_METADATA) # importmetadata = basedigcollector.basebookcontainer.import_json_metadata_from_datagouvfr(DATAGOUVFR_SOURCE_METADATA['core_url'], 0) # basedigcollector.basebookcontainer.generate_basebooks_from_datagouvfr(importmetadata, LOCAL_FOLDER['project_home_path']+DATAGOUVFR_SOURCE_METADATA['collected_files_folder'], engine='c', startindex=0) if EXECUTION_PATH['task'] == 'Test': #****************************************************** # THIS PART CRASHES #****************************************************** testdatacollector = DataCollector(DATAGOUVFR_SOURCE_METADATA) #testdatacollector.generate_basebook_from_datagouvfr_page(DATAGOUVFR_SOURCE_METADATA['core_url'], 0, 2, 1) #testdatacollector.parse_all_basebooks_from_datagouvfr_page(DATAGOUVFR_SOURCE_METADATA['core_url'], 0, outputdirectory=DATAGOUVFR_SOURCE_METADATA['collected_files_folder']) testdatacollector.parse_all_pages_from_datagouvfr(DATAGOUVFR_SOURCE_METADATA['core_url'], 0, 12, outputdirectory=DATAGOUVFR_SOURCE_METADATA['collected_files_folder'], cleanup=True, removeduplicates=True) #****************************************************** # THIS PART DOES NOT CRASH #****************************************************** # testdataframe = pd.DataFrame() # testdict=dict() # testdataframe= pd.read_csv(LOCAL_FOLDER['temporary_files'] + 'temporaryfile2018-12-24 14:23 65.json', delimiter=';', encoding="latin-1") # print('Test dataframe: ') # print(testdataframe) # print('Removing duplicates:') #testdataframe.drop_duplicates() #testbbdf = BasebookDataFrame() #testbbdf.import_dataframe(testdataframe, cleanup=True, removeduplicates=True) #testbb = BasebookDataContainer() #testbb.import_dataframe(testdataframe, cleanupdata=True, removeduplicates=True) #testbb.display() #inputurl='http://static.data.gouv.fr/e0/bde1269429b21724f7294f4c6d45bc9961d05daead238123956de450d93457.csv' #urllib.request.urlretrieve(inputurl, 'debugfile.csv') now = datetime.now() print('********************** TASK COMPLETED ', now.strftime("%Y-%m-%d %Hh%M"), '***********************\n\n') </len(encodinglist))):>