import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Load the county-level education dataset and take a first look at it.
df1 = pd.read_csv('EduDataset.csv')
df1.head(3)
# The dtypes show which numeric-looking columns were read in as strings.
print(df1.dtypes)
county object
Expeditures for Public Elementary and Secondary Schools per Pupil object
Income by School County object
Sum of Number Students int64
Count of Average Standard Score (2020-21)1 int64
Count of Percent African American1 int64
Average of Number Full-time Teachers float64
Average of PercentAmericanIndian1 object
Average of PercentAsian1 object
Average of PercentHispanic1 object
Average of PercentPacificIslander1 object
Average of PercentTwoorMoreRaces1 object
Average of PercentWhite1 object
Average_Standard _Score_(2021-22) float64
Average of Rank (2020-21) int64
Average of Rank Change from (2020-21) int64
Average of SchoolDigger Star Rating float64
Average of SchoolDigger Star Rating11 float64
dtype: object
# Income and per-pupil expenditures are read as strings with thousands
# separators (e.g. "56,525"); strip the commas, then convert to numbers.
df1['Income by School County'] = df1['Income by School County'].str.replace(',', '')
df1['Income by School County'] = pd.to_numeric(df1['Income by School County'])
df1['Expeditures for Public Elementary and Secondary Schools per Pupil'] = (
    df1['Expeditures for Public Elementary and Secondary Schools per Pupil'].str.replace(',', '')
)
df1['Expeditures for Public Elementary and Secondary Schools per Pupil'] = pd.to_numeric(
    df1['Expeditures for Public Elementary and Secondary Schools per Pupil']
)
# Load a second, untouched copy of the dataset (used later for the final join).
df_EduDataset = pd.read_csv('EduDataset.csv')
df_EduDataset.head(3)
county | Expeditures for Public Elementary and Secondary Schools per Pupil | Income by School County | Sum of Number Students | Count of Average Standard Score (2020-21)1 | Count of Percent African American1 | Average of Number Full-time Teachers | Average of PercentAmericanIndian1 | Average of PercentAsian1 | Average of PercentHispanic1 | Average of PercentPacificIslander1 | Average of PercentTwoorMoreRaces1 | Average of PercentWhite1 | Average_Standard _Score_(2021-22) | Average of Rank (2020-21) | Average of Rank Change from (2020-21) | Average of SchoolDigger Star Rating | Average of SchoolDigger Star Rating11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Surry County | 20,729 | 56,525 | 240 | 1 | 1 | 23.70 | 0.00% | 0.00% | 1.30% | 0.80% | 6.30% | 42.10% | 32.30 | 880 | 60 | 1.0 | 1.0 |
1 | Arlington County | 19,293 | 122,604 | 12663 | 24 | 24 | 34.23 | 0.14% | 9.30% | 28.38% | 0.01% | 8.45% | 42.64% | 63.63 | 398 | -8 | 3.2 | 3.2 |
2 | Falls Church city | 19,262 | 146,922 | 518 | 1 | 1 | 28.00 | 0.00% | 8.50% | 12.00% | 0.00% | 11.00% | 65.30% | 90.10 | 24 | -28 | 5.0 | 5.0 |
# Load the county shapefile for Virginia
counties = gpd.read_file('VirginiaCounty.shp')
counties.head(30)
# info() lists the columns; NAMELSAD is the one used as the join key below.
counties.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 133 entries, 0 to 132
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 STCOFIPS 133 non-null object
1 GNIS 133 non-null object
2 NAME 133 non-null object
3 NAMELSAD 133 non-null object
4 GSOURCE 133 non-null object
5 LADOPT 133 non-null object
6 AREASQMI 133 non-null float64
7 LASTUPDATE 130 non-null object
8 JURISTYPE 133 non-null object
9 geometry 133 non-null geometry
dtypes: float64(1), geometry(1), object(8)
memory usage: 10.5+ KB
# Join the income data with the shapefile by county name.
# merge() defaults to an inner join, so counties whose NAMELSAD has no exact
# match in the 'county' column are dropped from the result.
gdf_counties_income = counties.merge(df1, left_on='NAMELSAD', right_on='county')
gdf_counties_income.head(3)
STCOFIPS | GNIS | NAME | NAMELSAD | GSOURCE | LADOPT | AREASQMI | LASTUPDATE | JURISTYPE | geometry | ... | Average of PercentAsian1 | Average of PercentHispanic1 | Average of PercentPacificIslander1 | Average of PercentTwoorMoreRaces1 | Average of PercentWhite1 | Average_Standard _Score_(2021-22) | Average of Rank (2020-21) | Average of Rank Change from (2020-21) | Average of SchoolDigger Star Rating | Average of SchoolDigger Star Rating11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 51121 | 1674630 | Montgomery | Montgomery County | L | Y | 389.266823 | 2021-09-23 | CO | MULTIPOLYGON (((-72947.418 152168.110, -72250.... | ... | 4.67% | 7.17% | 0.05% | 6.69% | 77.19% | 68.30 | 418 | 58 | 3.3 | 3.3 |
1 | 51167 | 1497573 | Russell | Russell County | V | N | 476.490225 | 2014-09-24 | CO | POLYGON ((-203087.739 108906.044, -203120.476 ... | ... | 0.24% | 0.86% | 0.00% | 0.46% | 97.90% | 67.60 | 371 | 1 | 3.4 | 3.4 |
2 | 51089 | 1502770 | Henry | Henry County | T | N | 384.484406 | 2021-09-23 | CO | POLYGON ((-13013.107 92620.310, -13061.861 923... | ... | 0.29% | 15.11% | 0.07% | 9.57% | 51.54% | 42.96 | 785 | 98 | 1.8 | 1.8 |
3 rows × 28 columns
# Create a choropleth map of income by county.
fig, ax = plt.subplots(1, 1)
# vmin/vmax pin the colour scale to a fixed 0-150000 range.
gdf_counties_income.plot(
    column='Income by School County',
    cmap='BuGn',
    ax=ax,
    legend=True,
    vmin=0,
    vmax=150000,
)
ax.set_title('Income by County in Virginia', y=1.55)
plt.show()
# Create a choropleth map of per-pupil expenditures for public elementary
# and secondary schools by county.
fig, ax = plt.subplots(1, 1)
# vmin/vmax pin the colour scale to a fixed 0-22000 range.
gdf_counties_income.plot(
    column='Expeditures for Public Elementary and Secondary Schools per Pupil',
    cmap='BuGn',
    ax=ax,
    legend=True,
    vmin=0,
    vmax=22000,
)
ax.set_title('Expeditures for Public Elementary and Secondary Schools per Pupil', y=1.55)
plt.show()
# Compute summary statistics for the income column.
income_summary = df1['Income by School County'].describe()

# Print the income summary table.
print(income_summary)
count 130
unique 130
top 56,525
freq 1
Name: Income by School County, dtype: object
# Compute summary statistics for the per-pupil expenditures column.
Expeditures_summary = df1['Expeditures for Public Elementary and Secondary Schools per Pupil'].describe()

# Print the expenditures summary table.
print(Expeditures_summary)
count 130
unique 126
top 11,978
freq 2
Name: Expeditures for Public Elementary and Secondary Schools per Pupil, dtype: object
# Join the last dataset by county name.
# gdf_counties_income already carries the education columns from the earlier
# merge, so the overlapping column names receive pandas' default _x/_y suffixes.
educationQualityDataset = gdf_counties_income.merge(df_EduDataset, left_on='NAMELSAD', right_on='county')
educationQualityDataset.head()
STATEFP | COUNTYFP | COUSUBFP | COUSUBNS | GEOID | NAME | NAMELSAD | LSAD | CLASSFP | MTFCC | ... | Average of PercentAsian1_y | Average of PercentHispanic1_y | Average of PercentPacificIslander1_y | Average of PercentTwoorMoreRaces1_y | Average of PercentWhite1_y | Average_Standard _Score_(2021-22)_y | Average of Rank (2020-21)_y | Average of Rank Change from (2020-21)_y | Average of SchoolDigger Star Rating_y | Average of SchoolDigger Star Rating11_y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 51 | 570 | 90948 | 01498420 | 5157090948 | Colonial Heights | Colonial Heights city | 25 | Z7 | G4040 | ... | 2.90% | 15.43% | 0.00% | 10.87% | 44.77% | 45.83 | 716 | 63 | 1.7 | 1.7 |
1 | 51 | 790 | 95875 | 01789075 | 5179095875 | Staunton | Staunton city | 25 | Z7 | G4040 | ... | 1.40% | 7.10% | 0.07% | 14.60% | 62.70% | 55.03 | 554 | 20 | 2.7 | 2.7 |
2 | 51 | 540 | 90780 | 01789068 | 5154090780 | Charlottesville | Charlottesville city | 25 | Z7 | G4040 | ... | 5.60% | 10.98% | 0.00% | 12.73% | 41.92% | 40.97 | 645 | -52 | 1.8 | 1.8 |
3 | 51 | 700 | 94851 | 01498555 | 5170094851 | Newport News | Newport News city | 25 | Z7 | G4040 | ... | 1.86% | 16.26% | 0.32% | 8.61% | 21.36% | 21.77 | 915 | 13 | 0.8 | 0.8 |
4 | 51 | 735 | 95115 | 01498436 | 5173595115 | Poquoson | Poquoson city | 25 | Z7 | G4040 | ... | 0.90% | 4.90% | 0.00% | 2.90% | 89.40% | 83.10 | 31 | -133 | 4.0 | 4.0 |
5 rows × 55 columns
# Load a fresh copy of the dataset for the correlation and clustering analysis.
EduData = pd.read_csv('EduDataset.csv')
EduData.head(3)
EduData.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 county 130 non-null object
1 Expeditures for Public Elementary and Secondary Schools per Pupil 130 non-null object
2 Income by School County 130 non-null object
3 Sum of Number Students 130 non-null int64
4 Count of Average Standard Score (2020-21)1 130 non-null int64
5 Count of Percent African American1 130 non-null int64
6 Average of Number Full-time Teachers 130 non-null float64
7 Average of PercentAmericanIndian1 130 non-null object
8 Average of PercentAsian1 130 non-null object
9 Average of PercentHispanic1 130 non-null object
10 Average of PercentPacificIslander1 130 non-null object
11 Average of PercentTwoorMoreRaces1 130 non-null object
12 Average of PercentWhite1 130 non-null object
13 Average_Standard _Score_(2021-22) 130 non-null float64
14 Average of Rank (2020-21) 130 non-null int64
15 Average of Rank Change from (2020-21) 130 non-null int64
16 Average of SchoolDigger Star Rating 130 non-null float64
17 Average of SchoolDigger Star Rating11 130 non-null float64
dtypes: float64(4), int64(5), object(9)
memory usage: 18.4+ KB
# Show the dtype pandas inferred for each column of EduData.
print(EduData.dtypes)
county object
Expeditures for Public Elementary and Secondary Schools per Pupil object
Income by School County object
Sum of Number Students int64
Count of Average Standard Score (2020-21)1 int64
Count of Percent African American1 int64
Average of Number Full-time Teachers float64
Average of PercentAmericanIndian1 object
Average of PercentAsian1 object
Average of PercentHispanic1 object
Average of PercentPacificIslander1 object
Average of PercentTwoorMoreRaces1 object
Average of PercentWhite1 object
Average_Standard _Score_(2021-22) float64
Average of Rank (2020-21) int64
Average of Rank Change from (2020-21) int64
Average of SchoolDigger Star Rating float64
Average of SchoolDigger Star Rating11 float64
dtype: object
# Income is read as a string with thousands separators (e.g. "56,525");
# strip the commas and convert to integers.
EduData['Income by School County'] = EduData['Income by School County'].str.replace(',', '').astype(int)
# Per-pupil expenditures have the same "20,729"-style formatting and object
# dtype. They are used as a numeric axis in the regression plots, so they are
# converted here too.
EduData['Expeditures for Public Elementary and Secondary Schools per Pupil'] = (
    EduData['Expeditures for Public Elementary and Secondary Schools per Pupil']
    .str.replace(',', '')
    .astype(int)
)
# NOTE(review): this column is already float64, so the value-level
# .replace(',', '') matches nothing, and astype(int) truncates the averages
# (e.g. 23.70 -> 23). Confirm the truncation is intended.
EduData['Average of Number Full-time Teachers'] = EduData['Average of Number Full-time Teachers'].replace(',', '').astype(int)
import seaborn as sns
import matplotlib.pyplot as plt
# Scatter plot with a fitted regression line: county income (x) against
# per-pupil expenditures (y).
sns.regplot(
    x='Income by School County',
    y='Expeditures for Public Elementary and Secondary Schools per Pupil',
    data=EduData,
)
plt.title('Correlation between Income by School County and Expeditures per Pupil')
plt.xlabel('Income by School County')
plt.ylabel('Expeditures per Pupil')
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
# Scatter plot with a fitted regression line: per-pupil expenditures (x)
# against the average SchoolDigger star rating (y).
sns.regplot(
    x='Expeditures for Public Elementary and Secondary Schools per Pupil',
    y='Average of SchoolDigger Star Rating',
    data=EduData,
)
plt.title('SchoolDigger Star Rating and Expeditures per Pupil')
plt.xlabel('Expeditures per Pupil')
plt.ylabel('SchoolDigger Star Rating')
plt.show()
# Scatter plot with a fitted regression line: county income (x) against the
# 2021-22 average standard score (y).
sns.regplot(x='Income by School County', y='Average_Standard _Score_(2021-22)', data=EduData)
plt.title('Average Standard Score and Income by School County')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()
# Regression plot of county income (x) against the 2021-22 average standard
# score (y), using the same columns, title and axis labels as the cell before it.
sns.regplot(x='Income by School County', y='Average_Standard _Score_(2021-22)', data=EduData)
plt.title('Average Standard Score and Income by School County')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()
# Select the columns to use for clustering.
# NOTE(review): the features are not scaled; income is in the tens of
# thousands while the score is below 100, so income will dominate the
# Euclidean distances. Confirm this is acceptable.
X = EduData[['Income by School County', 'Average_Standard _Score_(2021-22)']]

# Perform k-means clustering with 3 clusters (fixed seed for reproducibility).
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)

# Add the cluster labels to the data frame.
EduData['Cluster'] = kmeans.labels_

# Plot the clusters, coloured by cluster label.
sns.scatterplot(
    x='Income by School County',
    y='Average_Standard _Score_(2021-22)',
    hue='Cluster',
    data=EduData,
)
plt.title('Clustering of Education Data')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()
# Set the color and marker shape for each cluster (one entry per cluster).
colors = ["blue", "green", "red"]
shapes = ["o", "s", "d"]

# Select the columns to use for clustering.
X = EduData[['Income by School County', 'Average_Standard _Score_(2021-22)']]

# Perform k-means clustering with 3 clusters (fixed seed for reproducibility).
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)

# Add the cluster labels to the data frame.
EduData['Cluster'] = kmeans.labels_

# Plot the clusters with different colors and shapes for each cluster.
sns.scatterplot(
    x='Income by School County',
    y='Average_Standard _Score_(2021-22)',
    hue='Cluster',
    style='Cluster',
    palette=colors,
    markers=shapes,
    data=EduData,
)
plt.title('Clustering of Education Data')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()