import sqlite3
import pandas as pd
= sqlite3.connect('D:/data/sqlite3/FinalDB.db') conn
Load Data into DFs
CSV to SQLite3
Data
In this project we’ll study 3 datasets regarding the city of Chicago.
1. Socioeconomic Indicators in Chicago
This dataset contains a selection of six socioeconomic indicators of public health significance and a “hardship index,” for each Chicago community area, for the years 2008 – 2012. A detailed description of this dataset and the original dataset can be obtained from the Chicago Data Portal at:
2. Chicago Public Schools
This dataset shows all school level performance data used to create CPS School Report Cards for the 2011-2012 school year. This dataset is provided by the city of Chicago’s Data Portal.
A detailed description of this dataset and the original dataset can be obtained from the Chicago Data Portal at:
3. Chicago Crime Data
This dataset reflects reported incidents of crime (with the exception of murders where data exists for each victim) that occurred in the City of Chicago from 2001 to present, minus the most recent seven days.
A detailed description of this dataset and the original dataset can be obtained from the Chicago Data Portal at:
https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present/ijzp-q8t2
Download Data
We will populate the three tables with a subset of the whole datasets. The subset have been provided here and are listed below.
Create DB
# Set file paths
= 'D:/data/sqlite3/ChicagoCensusData.csv'
census_path = 'D:/data/sqlite3/ChicagoPublicSchools.csv'
school_path = 'D:/data/sqlite3/ChicagoCrimeData.csv'
crime_path = pd.read_csv(census_path)
df_census = pd.read_csv(school_path)
df_school = pd.read_csv(crime_path) df_crime
Load DFs into Tables
'CENSUS_DATA',conn, if_exists = 'replace', index = False)
df_census.to_sql('CHICAGO_PUBLIC_SCHOOLS',conn, if_exists = 'replace', index = False)
df_school.to_sql('CHICAGO_CRIME_DATA',conn, if_exists = 'replace', index = False) df_crime.to_sql(
533
Find Total # of Crimes
= conn.cursor() curs
#Find the total number of crimes recorded in the CRIME table
= '''
count_crime SELECT COUNT(*)
FROM CHICAGO_CRIME_DATA'''
curs.execute(count_crime)print("Total Number of Crime is: ")
= curs.fetchall()
output_all for row_all in output_all:
print(row_all)
Total Number of Crime is:
(533,)
Community Areas < 11000
- My guess this will be in CENSUS_DATA
- Find community name and numbers with per capita income less than 11000
#List community area names and numbers with per capita income less than 11000
= '''
query2 SELECT COMMUNITY_AREA_NUMBER, COMMUNITY_AREA_NAME
FROM CENSUS_DATA
WHERE PER_CAPITA_INCOME < 11000'''
curs.execute(query2)print("Communities under 11000/capita ")
= curs.fetchall()
output_all for row_all in output_all:
print(row_all)
Communities under 11000/capita
(26.0, 'West Garfield Park')
(30.0, 'South Lawndale')
(37.0, 'Fuller Park')
(54.0, 'Riverdale')
Crimes with Minors
# List all case numbers for crimes involving minors?(children are not considered minors for the purposes of crime analysis - I added description col as well
= """
q2 SELECT CASE_NUMBER, DESCRIPTION
FROM CHICAGO_CRIME_DATA
WHERE DESCRIPTION LIKE '%MINOR%'"""
curs.execute(q2)print("Cases involving minors")
= curs.fetchall()
output_all for row_all in output_all:
print(row_all)
Cases involving minors
('HL266884', 'SELL/GIVE/DEL LIQUOR TO MINOR')
('HK238408', 'ILLEGAL CONSUMPTION BY MINOR')
Kidnapping involving a child
#List all kidnapping crimes involving a child?
= '''
q3 SELECT *
FROM CHICAGO_CRIME_DATA
WHERE PRIMARY_TYPE = "KIDNAPPING" AND DESCRIPTION LIKE "%CHILD%"'''
curs.execute(q3)print("Kidnapping involving a child")
= curs.fetchall()
output_all for row_all in output_all:
print(row_all)
Kidnapping involving a child
(5276766, 'HN144152', '2007-01-26', '050XX W VAN BUREN ST', '1792', 'KIDNAPPING', 'CHILD ABDUCTION/STRANGER', 'STREET', 0, 0, 1533, 15, 29.0, 25.0, '20', 1143050.0, 1897546.0, 2007, 41.87490841, -87.75024931, '(41.874908413, -87.750249307)')
Crimes Recorded at Schools
# this is for all crimes occurring at schools
= """
q4 SELECT PRIMARY_TYPE
FROM CHICAGO_CRIME_DATA
WHERE LOCATION_DESCRIPTION LIKE 'SCHOOL%'"""
curs.execute(q4)print("All crimes at schools")
= curs.fetchall()
output_all for row_all in output_all:
print(row_all)
All crimes at schools
('BATTERY',)
('BATTERY',)
('BATTERY',)
('BATTERY',)
('BATTERY',)
('CRIMINAL DAMAGE',)
('NARCOTICS',)
('NARCOTICS',)
('ASSAULT',)
('CRIMINAL TRESPASS',)
('PUBLIC PEACE VIOLATION',)
('PUBLIC PEACE VIOLATION',)
# this is for distinct crimes at schools
= """
q5 SELECT DISTINCT(PRIMARY_TYPE)
FROM CHICAGO_CRIME_DATA
WHERE LOCATION_DESCRIPTION LIKE 'SCHOOL%'"""
curs.execute(q5)print("All DISTINCT crimes at schools")
= curs.fetchall()
output_all for row_all in output_all:
print(row_all)
All DISTINCT crimes at schools
('BATTERY',)
('CRIMINAL DAMAGE',)
('NARCOTICS',)
('ASSAULT',)
('CRIMINAL TRESPASS',)
('PUBLIC PEACE VIOLATION',)
School Type Safety Score
# List the type of schools along with the average safety score for each type
= '''
q6 SELECT `Elementary, Middle, or High School` AS School_type, AVG(SAFETY_SCORE)
FROM CHICAGO_PUBLIC_SCHOOLS
GROUP BY School_type'''
curs.execute(q6)print("School Types and Average Safety Scores")
= curs.fetchall()
output_all for row_all in output_all:
print(row_all)
School Types and Average Safety Scores
('ES', 49.52038369304557)
('HS', 49.62352941176471)
('MS', 48.0)
Highest % Below Poverty Line
# List 5 community areas with highest % of households below poverty line
= '''
q7 SELECT COMMUNITY_AREA_NAME, PERCENT_HOUSEHOLDS_BELOW_POVERTY AS MARKER
FROM CENSUS_DATA
ORDER BY MARKER DESC
LIMIT 5'''
curs.execute(q7)print("Top 5 areas with highest % below poverty line")
= curs.fetchall()
output_all for row_all in output_all:
print(row_all)
"""could also use
SELECT COMMUNITY_AREA_NAME
FROM CENSUS_DATA
ORDER BY PERCENT_HOUSEHOLDS_BELOW_POVERTY DESC LIMIT 5"""
Top 5 areas with highest % below poverty line
('Riverdale', 56.5)
('Fuller Park', 51.2)
('Englewood', 46.6)
('North Lawndale', 43.1)
('East Garfield Park', 42.4)
'could also use\nSELECT COMMUNITY_AREA_NAME\nFROM CENSUS_DATA\nORDER BY PERCENT_HOUSEHOLDS_BELOW_POVERTY DESC LIMIT 5'
Most Crime Prone Community
# Which community area is most crime prone? Display the coumminty area number only.
= '''
q8 SELECT COMMUNITY_AREA_NUMBER, COUNT(CASE_NUMBER) AS TOTAL
FROM CHICAGO_CRIME_DATA
GROUP BY COMMUNITY_AREA_NUMBER
ORDER BY TOTAL DESC
LIMIT 1'''
curs.execute(q8)print("Most Crime Prone Comunity Area Number")
= curs.fetchall()
output_all for row_all in output_all:
print(row_all)
Most Crime Prone Comunity Area Number
(25.0, 43)
# SAME as above but display the Community Area Number only
= '''
q8b SELECT COMMUNITY_AREA_NUMBER
FROM CHICAGO_CRIME_DATA
GROUP BY COMMUNITY_AREA_NUMBER
ORDER BY COUNT(CASE_NUMBER) DESC
LIMIT 1'''
curs.execute(q8b)print("Most Crime Prone Comunity Area Number")
= curs.fetchall()
output_all for row_all in output_all:
print(row_all)
Most Crime Prone Comunity Area Number
(25.0,)
Community with Highest Hardship Index
# Use a sub-query to find the name of the community area with highest hardship index
= '''
q9 SELECT COMMUNITY_AREA_NAME, MAX(HARDSHIP_INDEX)
FROM CENSUS_DATA'''
curs.execute(q9)print("Community Area with Highest Harship Index")
= curs.fetchall()
output_all for row_all in output_all:
print(row_all)
Community Area with Highest Harship Index
('Riverdale', 98.0)
Community Area Name Most Crimes
# Use a sub-query to determine the Community Area Name with most number of crimes
# we already have the community area number from 8 which is from table CHICAGO_CRIME_DATA, now we have to find the name from table CENSUS_DATA
# I added , COMMUNITY_AREA_NUMBER to verify the result with q8 above
= '''
q10 SELECT COMMUNITY_AREA_NAME, COMMUNITY_AREA_NUMBER
FROM CENSUS_DATA
WHERE COMMUNITY_AREA_NUMBER IN (
SELECT COMMUNITY_AREA_NUMBER
FROM CHICAGO_CRIME_DATA
GROUP BY COMMUNITY_AREA_NUMBER
ORDER BY COUNT(CASE_NUMBER) DESC
LIMIT 1
)'''
curs.execute(q10)print("Community Area with Highest Harship Index")
= curs.fetchall()
output_all for row_all in output_all:
print(row_all)
Community Area with Highest Harship Index
('Austin', 25.0)
for q10
Could use this setup
SELECT COMMUNITY_AREA_NAME
FROM CENSUS_DATA
WHERE HARDSHIP_INDEX IN
( SELECT MAX(HARDSHIP_INDEX) FROM CENSUS_DATA)