import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans# Load the data
df1 = pd.read_csv('EduDataset.csv')
df1.head(3)
print(df1.dtypes)county object
Expeditures for Public Elementary and Secondary Schools per Pupil object
Income by School County object
Sum of Number Students int64
Count of Average Standard Score (2020-21)1 int64
Count of Percent African American1 int64
Average of Number Full-time Teachers float64
Average of PercentAmericanIndian1 object
Average of PercentAsian1 object
Average of PercentHispanic1 object
Average of PercentPacificIslander1 object
Average of PercentTwoorMoreRaces1 object
Average of PercentWhite1 object
Average_Standard _Score_(2021-22) float64
Average of Rank (2020-21) int64
Average of Rank Change from (2020-21) int64
Average of SchoolDigger Star Rating float64
Average of SchoolDigger Star Rating11 float64
dtype: object
df1['Income by School County'] = df1['Income by School County'].str.replace(',', '')
df1['Income by School County'] = pd.to_numeric(df1['Income by School County'])df1['Expeditures for Public Elementary and Secondary Schools per Pupil'] = df1['Expeditures for Public Elementary and Secondary Schools per Pupil'].str.replace(',', '')
df1['Expeditures for Public Elementary and Secondary Schools per Pupil'] = pd.to_numeric(df1['Expeditures for Public Elementary and Secondary Schools per Pupil'])# Load the data
df_EduDataset= pd.read_csv('EduDataset.csv')
df_EduDataset.head(3)| county | Expeditures for Public Elementary and Secondary Schools per Pupil | Income by School County | Sum of Number Students | Count of Average Standard Score (2020-21)1 | Count of Percent African American1 | Average of Number Full-time Teachers | Average of PercentAmericanIndian1 | Average of PercentAsian1 | Average of PercentHispanic1 | Average of PercentPacificIslander1 | Average of PercentTwoorMoreRaces1 | Average of PercentWhite1 | Average_Standard _Score_(2021-22) | Average of Rank (2020-21) | Average of Rank Change from (2020-21) | Average of SchoolDigger Star Rating | Average of SchoolDigger Star Rating11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Surry County | 20,729 | 56,525 | 240 | 1 | 1 | 23.70 | 0.00% | 0.00% | 1.30% | 0.80% | 6.30% | 42.10% | 32.30 | 880 | 60 | 1.0 | 1.0 |
| 1 | Arlington County | 19,293 | 122,604 | 12663 | 24 | 24 | 34.23 | 0.14% | 9.30% | 28.38% | 0.01% | 8.45% | 42.64% | 63.63 | 398 | -8 | 3.2 | 3.2 |
| 2 | Falls Church city | 19,262 | 146,922 | 518 | 1 | 1 | 28.00 | 0.00% | 8.50% | 12.00% | 0.00% | 11.00% | 65.30% | 90.10 | 24 | -28 | 5.0 | 5.0 |
# Load the county shapefile for Virginia
counties = gpd.read_file('VirginiaCounty.shp')
counties.head(30)
counties.info()<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 133 entries, 0 to 132
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 STCOFIPS 133 non-null object
1 GNIS 133 non-null object
2 NAME 133 non-null object
3 NAMELSAD 133 non-null object
4 GSOURCE 133 non-null object
5 LADOPT 133 non-null object
6 AREASQMI 133 non-null float64
7 LASTUPDATE 130 non-null object
8 JURISTYPE 133 non-null object
9 geometry 133 non-null geometry
dtypes: float64(1), geometry(1), object(8)
memory usage: 10.5+ KB
# join the income data with the shapefile by county name
gdf_counties_income = counties.merge(df1, left_on='NAMELSAD', right_on='county')
gdf_counties_income.head(3)| STCOFIPS | GNIS | NAME | NAMELSAD | GSOURCE | LADOPT | AREASQMI | LASTUPDATE | JURISTYPE | geometry | ... | Average of PercentAsian1 | Average of PercentHispanic1 | Average of PercentPacificIslander1 | Average of PercentTwoorMoreRaces1 | Average of PercentWhite1 | Average_Standard _Score_(2021-22) | Average of Rank (2020-21) | Average of Rank Change from (2020-21) | Average of SchoolDigger Star Rating | Average of SchoolDigger Star Rating11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 51121 | 1674630 | Montgomery | Montgomery County | L | Y | 389.266823 | 2021-09-23 | CO | MULTIPOLYGON (((-72947.418 152168.110, -72250.... | ... | 4.67% | 7.17% | 0.05% | 6.69% | 77.19% | 68.30 | 418 | 58 | 3.3 | 3.3 |
| 1 | 51167 | 1497573 | Russell | Russell County | V | N | 476.490225 | 2014-09-24 | CO | POLYGON ((-203087.739 108906.044, -203120.476 ... | ... | 0.24% | 0.86% | 0.00% | 0.46% | 97.90% | 67.60 | 371 | 1 | 3.4 | 3.4 |
| 2 | 51089 | 1502770 | Henry | Henry County | T | N | 384.484406 | 2021-09-23 | CO | POLYGON ((-13013.107 92620.310, -13061.861 923... | ... | 0.29% | 15.11% | 0.07% | 9.57% | 51.54% | 42.96 | 785 | 98 | 1.8 | 1.8 |
3 rows × 28 columns
# create a choropleth map of income by county
fig, ax = plt.subplots(1, 1)
gdf_counties_income.plot(column='Income by School County', cmap='BuGn', ax=ax, legend=True, vmin=0, vmax=150000)
ax.set_title('Income by County in Virginia', y=1.55)
plt.show()
# create a choropleth map of Expeditures for Public Elementary and Secondary Schools per Pupil by county
fig, ax = plt.subplots(1, 1)
gdf_counties_income.plot(column='Expeditures for Public Elementary and Secondary Schools per Pupil',
cmap='BuGn', ax=ax, legend=True, vmin=0, vmax=22000)
ax.set_title('Expeditures for Public Elementary and Secondary Schools per Pupil', y=1.55)
plt.show()
# compute summary statistics for the income column
income_summary = df1['Income by School County'].describe()
# print the income summary table
print(income_summary)count 130
unique 130
top 56,525
freq 1
Name: Income by School County, dtype: object
# compute summary statistics for the Expeditures for Public Elementary and Secondary Schools per Pupil column
Expeditures_summary = df1['Expeditures for Public Elementary and Secondary Schools per Pupil'].describe()
# print the income summary table
print(Expeditures_summary)count 130
unique 126
top 11,978
freq 2
Name: Expeditures for Public Elementary and Secondary Schools per Pupil, dtype: object
# join last datset by county name
educationQualityDataset = gdf_counties_income.merge(df_EduDataset, left_on='NAMELSAD', right_on='county')
educationQualityDataset.head()| STATEFP | COUNTYFP | COUSUBFP | COUSUBNS | GEOID | NAME | NAMELSAD | LSAD | CLASSFP | MTFCC | ... | Average of PercentAsian1_y | Average of PercentHispanic1_y | Average of PercentPacificIslander1_y | Average of PercentTwoorMoreRaces1_y | Average of PercentWhite1_y | Average_Standard _Score_(2021-22)_y | Average of Rank (2020-21)_y | Average of Rank Change from (2020-21)_y | Average of SchoolDigger Star Rating_y | Average of SchoolDigger Star Rating11_y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 51 | 570 | 90948 | 01498420 | 5157090948 | Colonial Heights | Colonial Heights city | 25 | Z7 | G4040 | ... | 2.90% | 15.43% | 0.00% | 10.87% | 44.77% | 45.83 | 716 | 63 | 1.7 | 1.7 |
| 1 | 51 | 790 | 95875 | 01789075 | 5179095875 | Staunton | Staunton city | 25 | Z7 | G4040 | ... | 1.40% | 7.10% | 0.07% | 14.60% | 62.70% | 55.03 | 554 | 20 | 2.7 | 2.7 |
| 2 | 51 | 540 | 90780 | 01789068 | 5154090780 | Charlottesville | Charlottesville city | 25 | Z7 | G4040 | ... | 5.60% | 10.98% | 0.00% | 12.73% | 41.92% | 40.97 | 645 | -52 | 1.8 | 1.8 |
| 3 | 51 | 700 | 94851 | 01498555 | 5170094851 | Newport News | Newport News city | 25 | Z7 | G4040 | ... | 1.86% | 16.26% | 0.32% | 8.61% | 21.36% | 21.77 | 915 | 13 | 0.8 | 0.8 |
| 4 | 51 | 735 | 95115 | 01498436 | 5173595115 | Poquoson | Poquoson city | 25 | Z7 | G4040 | ... | 0.90% | 4.90% | 0.00% | 2.90% | 89.40% | 83.10 | 31 | -133 | 4.0 | 4.0 |
5 rows × 55 columns
# Load the data
EduData = pd.read_csv('EduDataset.csv')
EduData.head(3)
EduData.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 county 130 non-null object
1 Expeditures for Public Elementary and Secondary Schools per Pupil 130 non-null object
2 Income by School County 130 non-null object
3 Sum of Number Students 130 non-null int64
4 Count of Average Standard Score (2020-21)1 130 non-null int64
5 Count of Percent African American1 130 non-null int64
6 Average of Number Full-time Teachers 130 non-null float64
7 Average of PercentAmericanIndian1 130 non-null object
8 Average of PercentAsian1 130 non-null object
9 Average of PercentHispanic1 130 non-null object
10 Average of PercentPacificIslander1 130 non-null object
11 Average of PercentTwoorMoreRaces1 130 non-null object
12 Average of PercentWhite1 130 non-null object
13 Average_Standard _Score_(2021-22) 130 non-null float64
14 Average of Rank (2020-21) 130 non-null int64
15 Average of Rank Change from (2020-21) 130 non-null int64
16 Average of SchoolDigger Star Rating 130 non-null float64
17 Average of SchoolDigger Star Rating11 130 non-null float64
dtypes: float64(4), int64(5), object(9)
memory usage: 18.4+ KB
print(EduData.dtypes)county object
Expeditures for Public Elementary and Secondary Schools per Pupil object
Income by School County object
Sum of Number Students int64
Count of Average Standard Score (2020-21)1 int64
Count of Percent African American1 int64
Average of Number Full-time Teachers float64
Average of PercentAmericanIndian1 object
Average of PercentAsian1 object
Average of PercentHispanic1 object
Average of PercentPacificIslander1 object
Average of PercentTwoorMoreRaces1 object
Average of PercentWhite1 object
Average_Standard _Score_(2021-22) float64
Average of Rank (2020-21) int64
Average of Rank Change from (2020-21) int64
Average of SchoolDigger Star Rating float64
Average of SchoolDigger Star Rating11 float64
dtype: object
EduData['Income by School County'] = EduData['Income by School County'].str.replace(',', '').astype(int)EduData['Average of Number Full-time Teachers'] = EduData['Average of Number Full-time Teachers'].replace(',', '').astype(int)import seaborn as sns
import matplotlib.pyplot as plt
sns.regplot(x='Income by School County', y='Expeditures for Public Elementary and Secondary Schools per Pupil', data=EduData)
plt.title('Correlation between Income by School County and Expeditures per Pupil')
plt.xlabel('Income by School County')
plt.ylabel('Expeditures per Pupil')
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
sns.regplot(x='Expeditures for Public Elementary and Secondary Schools per Pupil', y='Average of SchoolDigger Star Rating', data=EduData)
plt.title('SchoolDigger Star Rating and Expeditures per Pupil')
plt.xlabel('Expeditures per Pupil')
plt.ylabel('SchoolDigger Star Rating')
plt.show()
sns.regplot(x='Income by School County', y='Average_Standard _Score_(2021-22)', data=EduData)
plt.title('Average Standard Score and Income by School County')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()
sns.regplot(x='Income by School County', y='Average_Standard _Score_(2021-22)', data=EduData)
plt.title('Average Standard Score and Income by School County')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()
# Select the columns to use for clustering
X = EduData[['Income by School County', 'Average_Standard _Score_(2021-22)']]
# Perform k-means clustering with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
# Add the cluster labels to the data frame
EduData['Cluster'] = kmeans.labels_
# Plot the clusters
sns.scatterplot(x='Income by School County', y='Average_Standard _Score_(2021-22)', hue='Cluster', data=EduData)
plt.title('Clustering of Education Data')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()
# Set the color and shape for each cluster
colors = ["blue", "green", "red"]
shapes = ["o", "s", "d"]
# Select the columns to use for clustering
X = EduData[['Income by School County', 'Average_Standard _Score_(2021-22)']]
# Perform k-means clustering with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
# Add the cluster labels to the data frame
EduData['Cluster'] = kmeans.labels_
# Plot the clusters with different colors and shapes for each cluster
sns.scatterplot(x='Income by School County', y='Average_Standard _Score_(2021-22)', hue='Cluster', style='Cluster', palette=colors, markers=shapes, data=EduData)
plt.title('Clustering of Education Data')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()