CSV to SQLite3

Data


In this project we’ll study 3 datasets regarding the city of Chicago.

1. Socioeconomic Indicators in Chicago

This dataset contains a selection of six socioeconomic indicators of public health significance and a “hardship index,” for each Chicago community area, for the years 2008 – 2012. A detailed description of this dataset and the original dataset can be obtained from the Chicago Data Portal at:

https://data.cityofchicago.org/Health-Human-Services/Census-Data-Selected-socioeconomic-indicators-in-C/kn9c-c2s2

2. Chicago Public Schools

This dataset shows all school level performance data used to create CPS School Report Cards for the 2011-2012 school year. This dataset is provided by the city of Chicago’s Data Portal.

A detailed description of this dataset and the original dataset can be obtained from the Chicago Data Portal at:

https://data.cityofchicago.org/Education/Chicago-Public-Schools-Progress-Report-Cards-2011-/9xs2-f89t

3. Chicago Crime Data

This dataset reflects reported incidents of crime (with the exception of murders where data exists for each victim) that occurred in the City of Chicago from 2001 to present, minus the most recent seven days.

A detailed description of this dataset and the original dataset can be obtained from the Chicago Data Portal at:

https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present/ijzp-q8t2

Download Data


We will populate the three tables with a subset of the whole datasets. The subset have been provided here and are listed below.

Create DB


import sqlite3
import pandas as pd
conn = sqlite3.connect('D:/data/sqlite3/FinalDB.db')

Load Data into DFs


# Set file paths
census_path = 'D:/data/sqlite3/ChicagoCensusData.csv'
school_path = 'D:/data/sqlite3/ChicagoPublicSchools.csv'
crime_path = 'D:/data/sqlite3/ChicagoCrimeData.csv'
df_census = pd.read_csv(census_path)
df_school = pd.read_csv(school_path)
df_crime = pd.read_csv(crime_path)

Load DFs into Tables


df_census.to_sql('CENSUS_DATA',conn, if_exists = 'replace', index = False)
df_school.to_sql('CHICAGO_PUBLIC_SCHOOLS',conn, if_exists = 'replace', index = False)
df_crime.to_sql('CHICAGO_CRIME_DATA',conn, if_exists = 'replace', index = False)
533

Find Total # of Crimes

curs = conn.cursor()
#Find the total number of crimes recorded in the CRIME table
count_crime = '''
        SELECT COUNT(*)
        FROM   CHICAGO_CRIME_DATA'''
curs.execute(count_crime)
print("Total Number of Crime is: ")
output_all = curs.fetchall()
for row_all in output_all:
  print(row_all)
Total Number of Crime is: 
(533,)

Community Areas < 11000

  • My guess this will be in CENSUS_DATA
  • Find community name and numbers with per capita income less than 11000
#List community area names and numbers with per capita income less than 11000
query2 = '''
        SELECT COMMUNITY_AREA_NUMBER, COMMUNITY_AREA_NAME
        FROM   CENSUS_DATA
        WHERE  PER_CAPITA_INCOME < 11000'''
curs.execute(query2)
print("Communities under 11000/capita ")
output_all = curs.fetchall()
for row_all in output_all:
  print(row_all)
Communities under 11000/capita 
(26.0, 'West Garfield Park')
(30.0, 'South Lawndale')
(37.0, 'Fuller Park')
(54.0, 'Riverdale')

Crimes with Minors

# List all case numbers for crimes involving minors?(children are not considered minors for the purposes of crime analysis - I added description col as well
q2 = """
        SELECT CASE_NUMBER, DESCRIPTION
        FROM   CHICAGO_CRIME_DATA
        WHERE DESCRIPTION LIKE '%MINOR%'"""
curs.execute(q2)
print("Cases involving minors")
output_all = curs.fetchall()
for row_all in output_all:
  print(row_all) 
Cases involving minors
('HL266884', 'SELL/GIVE/DEL LIQUOR TO MINOR')
('HK238408', 'ILLEGAL CONSUMPTION BY MINOR')

Kidnapping involving a child

#List all kidnapping crimes involving a child?
q3 = '''
        SELECT *
        FROM   CHICAGO_CRIME_DATA
        WHERE  PRIMARY_TYPE = "KIDNAPPING" AND DESCRIPTION LIKE "%CHILD%"'''
curs.execute(q3)
print("Kidnapping involving a child")
output_all = curs.fetchall()
for row_all in output_all:
  print(row_all)       
Kidnapping involving a child
(5276766, 'HN144152', '2007-01-26', '050XX W VAN BUREN ST', '1792', 'KIDNAPPING', 'CHILD ABDUCTION/STRANGER', 'STREET', 0, 0, 1533, 15, 29.0, 25.0, '20', 1143050.0, 1897546.0, 2007, 41.87490841, -87.75024931, '(41.874908413, -87.750249307)')

Crimes Recorded at Schools

# this is for all crimes occurring at schools
q4 = """
        SELECT PRIMARY_TYPE
        FROM   CHICAGO_CRIME_DATA
        WHERE  LOCATION_DESCRIPTION LIKE 'SCHOOL%'"""
curs.execute(q4)
print("All crimes at schools")
output_all = curs.fetchall()
for row_all in output_all:
  print(row_all)        
All crimes at schools
('BATTERY',)
('BATTERY',)
('BATTERY',)
('BATTERY',)
('BATTERY',)
('CRIMINAL DAMAGE',)
('NARCOTICS',)
('NARCOTICS',)
('ASSAULT',)
('CRIMINAL TRESPASS',)
('PUBLIC PEACE VIOLATION',)
('PUBLIC PEACE VIOLATION',)
# this is for distinct crimes at schools
q5 = """
        SELECT DISTINCT(PRIMARY_TYPE)
        FROM   CHICAGO_CRIME_DATA
        WHERE  LOCATION_DESCRIPTION LIKE 'SCHOOL%'"""
curs.execute(q5)
print("All DISTINCT crimes at schools")
output_all = curs.fetchall()
for row_all in output_all:
  print(row_all) 
All DISTINCT crimes at schools
('BATTERY',)
('CRIMINAL DAMAGE',)
('NARCOTICS',)
('ASSAULT',)
('CRIMINAL TRESPASS',)
('PUBLIC PEACE VIOLATION',)

School Type Safety Score

# List the type of schools along with the average safety score for each type
q6 = '''
        SELECT    `Elementary, Middle, or High School` AS School_type, AVG(SAFETY_SCORE)
        FROM      CHICAGO_PUBLIC_SCHOOLS
        GROUP BY  School_type'''
curs.execute(q6)
print("School Types and Average Safety Scores")
output_all = curs.fetchall()
for row_all in output_all:
  print(row_all) 
School Types and Average Safety Scores
('ES', 49.52038369304557)
('HS', 49.62352941176471)
('MS', 48.0)

Highest % Below Poverty Line

# List 5 community areas with highest % of households below poverty line
q7 = '''
        SELECT   COMMUNITY_AREA_NAME, PERCENT_HOUSEHOLDS_BELOW_POVERTY AS MARKER
        FROM     CENSUS_DATA
        ORDER BY MARKER DESC
        LIMIT    5'''
curs.execute(q7)
print("Top 5 areas with highest % below poverty line")
output_all = curs.fetchall()
for row_all in output_all:
  print(row_all)   
  
"""could also use
SELECT   COMMUNITY_AREA_NAME
FROM     CENSUS_DATA
ORDER BY PERCENT_HOUSEHOLDS_BELOW_POVERTY DESC LIMIT 5"""
Top 5 areas with highest % below poverty line
('Riverdale', 56.5)
('Fuller Park', 51.2)
('Englewood', 46.6)
('North Lawndale', 43.1)
('East Garfield Park', 42.4)
'could also use\nSELECT   COMMUNITY_AREA_NAME\nFROM     CENSUS_DATA\nORDER BY PERCENT_HOUSEHOLDS_BELOW_POVERTY DESC LIMIT 5'

Most Crime Prone Community

# Which community area is most crime prone? Display the coumminty area number only.
q8 = '''
        SELECT    COMMUNITY_AREA_NUMBER, COUNT(CASE_NUMBER) AS TOTAL
        FROM      CHICAGO_CRIME_DATA
        GROUP BY  COMMUNITY_AREA_NUMBER
        ORDER BY  TOTAL DESC
        LIMIT     1'''
curs.execute(q8)
print("Most Crime Prone Comunity Area Number")
output_all = curs.fetchall()
for row_all in output_all:
  print(row_all)         
Most Crime Prone Comunity Area Number
(25.0, 43)
# SAME as above but display the Community Area Number only
q8b = '''
        SELECT    COMMUNITY_AREA_NUMBER
        FROM      CHICAGO_CRIME_DATA
        GROUP BY  COMMUNITY_AREA_NUMBER
        ORDER BY  COUNT(CASE_NUMBER) DESC
        LIMIT     1'''
curs.execute(q8b)
print("Most Crime Prone Comunity Area Number")
output_all = curs.fetchall()
for row_all in output_all:
  print(row_all)        
Most Crime Prone Comunity Area Number
(25.0,)

Community with Highest Hardship Index

# Use a sub-query to find the name of the community area with highest hardship index
q9 = '''
        SELECT   COMMUNITY_AREA_NAME, MAX(HARDSHIP_INDEX)
        FROM     CENSUS_DATA'''
curs.execute(q9)
print("Community Area with Highest Harship Index")
output_all = curs.fetchall()
for row_all in output_all:
  print(row_all)         
Community Area with Highest Harship Index
('Riverdale', 98.0)

Community Area Name Most Crimes

# Use a sub-query to determine the Community Area Name with most number of crimes
# we already have the community area number from 8 which is from table CHICAGO_CRIME_DATA, now we have to find the name from table CENSUS_DATA

# I added , COMMUNITY_AREA_NUMBER to verify the result with q8 above
q10 = '''
        SELECT  COMMUNITY_AREA_NAME, COMMUNITY_AREA_NUMBER 
        FROM    CENSUS_DATA
        WHERE   COMMUNITY_AREA_NUMBER IN (
                SELECT    COMMUNITY_AREA_NUMBER
                FROM      CHICAGO_CRIME_DATA
                GROUP BY  COMMUNITY_AREA_NUMBER
                ORDER BY  COUNT(CASE_NUMBER) DESC
                LIMIT     1
                )'''
curs.execute(q10)
print("Community Area with Highest Harship Index")
output_all = curs.fetchall()
for row_all in output_all:
  print(row_all)
Community Area with Highest Harship Index
('Austin', 25.0)
Could use this setup for q10
SELECT   COMMUNITY_AREA_NAME
FROM     CENSUS_DATA
WHERE    HARDSHIP_INDEX IN
                 ( SELECT  MAX(HARDSHIP_INDEX)
                   FROM    CENSUS_DATA)