import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Load the data
df1 = pd.read_csv('EduDataset.csv')
df1.head(3)
print(df1.dtypes)
county                                                                object
Expeditures for Public Elementary and Secondary Schools per Pupil     object
Income by School County                                               object
Sum of Number Students                                                 int64
Count of Average Standard Score (2020-21)1                             int64
Count of Percent African American1                                     int64
Average of Number Full-time Teachers                                 float64
Average of PercentAmericanIndian1                                     object
Average of PercentAsian1                                              object
Average of PercentHispanic1                                           object
Average of PercentPacificIslander1                                    object
Average of PercentTwoorMoreRaces1                                     object
Average of PercentWhite1                                              object
Average_Standard _Score_(2021-22)                                    float64
Average of Rank (2020-21)                                              int64
Average of Rank Change from (2020-21)                                  int64
Average of SchoolDigger Star Rating                                  float64
Average of SchoolDigger Star Rating11                                float64
dtype: object
df1['Income by School County'] = df1['Income by School County'].str.replace(',', '')
df1['Income by School County'] = pd.to_numeric(df1['Income by School County'])
df1['Expeditures for Public Elementary and Secondary Schools per Pupil'] = df1['Expeditures for Public Elementary and Secondary Schools per Pupil'].str.replace(',', '')
df1['Expeditures for Public Elementary and Secondary Schools per Pupil'] = pd.to_numeric(df1['Expeditures for Public Elementary and Secondary Schools per Pupil'])
# Load the data
df_EduDataset= pd.read_csv('EduDataset.csv')
df_EduDataset.head(3)
county Expeditures for Public Elementary and Secondary Schools per Pupil Income by School County Sum of Number Students Count of Average Standard Score (2020-21)1 Count of Percent African American1 Average of Number Full-time Teachers Average of PercentAmericanIndian1 Average of PercentAsian1 Average of PercentHispanic1 Average of PercentPacificIslander1 Average of PercentTwoorMoreRaces1 Average of PercentWhite1 Average_Standard _Score_(2021-22) Average of Rank (2020-21) Average of Rank Change from (2020-21) Average of SchoolDigger Star Rating Average of SchoolDigger Star Rating11
0 Surry County 20,729 56,525 240 1 1 23.70 0.00% 0.00% 1.30% 0.80% 6.30% 42.10% 32.30 880 60 1.0 1.0
1 Arlington County 19,293 122,604 12663 24 24 34.23 0.14% 9.30% 28.38% 0.01% 8.45% 42.64% 63.63 398 -8 3.2 3.2
2 Falls Church city 19,262 146,922 518 1 1 28.00 0.00% 8.50% 12.00% 0.00% 11.00% 65.30% 90.10 24 -28 5.0 5.0
# Load the county shapefile for Virginia
counties = gpd.read_file('VirginiaCounty.shp')
counties.head(30)
counties.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 133 entries, 0 to 132
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   STCOFIPS    133 non-null    object  
 1   GNIS        133 non-null    object  
 2   NAME        133 non-null    object  
 3   NAMELSAD    133 non-null    object  
 4   GSOURCE     133 non-null    object  
 5   LADOPT      133 non-null    object  
 6   AREASQMI    133 non-null    float64 
 7   LASTUPDATE  130 non-null    object  
 8   JURISTYPE   133 non-null    object  
 9   geometry    133 non-null    geometry
dtypes: float64(1), geometry(1), object(8)
memory usage: 10.5+ KB
# join the income data with the shapefile by county name
gdf_counties_income = counties.merge(df1, left_on='NAMELSAD', right_on='county')
gdf_counties_income.head(3)
STCOFIPS GNIS NAME NAMELSAD GSOURCE LADOPT AREASQMI LASTUPDATE JURISTYPE geometry ... Average of PercentAsian1 Average of PercentHispanic1 Average of PercentPacificIslander1 Average of PercentTwoorMoreRaces1 Average of PercentWhite1 Average_Standard _Score_(2021-22) Average of Rank (2020-21) Average of Rank Change from (2020-21) Average of SchoolDigger Star Rating Average of SchoolDigger Star Rating11
0 51121 1674630 Montgomery Montgomery County L Y 389.266823 2021-09-23 CO MULTIPOLYGON (((-72947.418 152168.110, -72250.... ... 4.67% 7.17% 0.05% 6.69% 77.19% 68.30 418 58 3.3 3.3
1 51167 1497573 Russell Russell County V N 476.490225 2014-09-24 CO POLYGON ((-203087.739 108906.044, -203120.476 ... ... 0.24% 0.86% 0.00% 0.46% 97.90% 67.60 371 1 3.4 3.4
2 51089 1502770 Henry Henry County T N 384.484406 2021-09-23 CO POLYGON ((-13013.107 92620.310, -13061.861 923... ... 0.29% 15.11% 0.07% 9.57% 51.54% 42.96 785 98 1.8 1.8

3 rows × 28 columns

# create a choropleth map of income by county
fig, ax = plt.subplots(1, 1)
gdf_counties_income.plot(column='Income by School County', cmap='BuGn', ax=ax, legend=True, vmin=0, vmax=150000)
ax.set_title('Income by County in Virginia', y=1.55)
plt.show()

# create a choropleth map of Expeditures for Public Elementary and Secondary Schools per Pupil by county
fig, ax = plt.subplots(1, 1)
gdf_counties_income.plot(column='Expeditures for Public Elementary and Secondary Schools per Pupil', 
                         cmap='BuGn', ax=ax, legend=True, vmin=0, vmax=22000)
ax.set_title('Expeditures for Public Elementary and Secondary Schools per Pupil', y=1.55)
plt.show()

# compute summary statistics for the income column
income_summary = df1['Income by School County'].describe()

# print the income summary table
print(income_summary)
count        130
unique       130
top       56,525
freq           1
Name: Income by School County, dtype: object
# compute summary statistics for the Expeditures for Public Elementary and Secondary Schools per Pupil column
Expeditures_summary = df1['Expeditures for Public Elementary and Secondary Schools per Pupil'].describe()

# print the income summary table
print(Expeditures_summary)
count        130
unique       126
top       11,978
freq           2
Name: Expeditures for Public Elementary and Secondary Schools per Pupil, dtype: object
# join last datset by county name
educationQualityDataset = gdf_counties_income.merge(df_EduDataset, left_on='NAMELSAD', right_on='county')
educationQualityDataset.head()
STATEFP COUNTYFP COUSUBFP COUSUBNS GEOID NAME NAMELSAD LSAD CLASSFP MTFCC ... Average of PercentAsian1_y Average of PercentHispanic1_y Average of PercentPacificIslander1_y Average of PercentTwoorMoreRaces1_y Average of PercentWhite1_y Average_Standard _Score_(2021-22)_y Average of Rank (2020-21)_y Average of Rank Change from (2020-21)_y Average of SchoolDigger Star Rating_y Average of SchoolDigger Star Rating11_y
0 51 570 90948 01498420 5157090948 Colonial Heights Colonial Heights city 25 Z7 G4040 ... 2.90% 15.43% 0.00% 10.87% 44.77% 45.83 716 63 1.7 1.7
1 51 790 95875 01789075 5179095875 Staunton Staunton city 25 Z7 G4040 ... 1.40% 7.10% 0.07% 14.60% 62.70% 55.03 554 20 2.7 2.7
2 51 540 90780 01789068 5154090780 Charlottesville Charlottesville city 25 Z7 G4040 ... 5.60% 10.98% 0.00% 12.73% 41.92% 40.97 645 -52 1.8 1.8
3 51 700 94851 01498555 5170094851 Newport News Newport News city 25 Z7 G4040 ... 1.86% 16.26% 0.32% 8.61% 21.36% 21.77 915 13 0.8 0.8
4 51 735 95115 01498436 5173595115 Poquoson Poquoson city 25 Z7 G4040 ... 0.90% 4.90% 0.00% 2.90% 89.40% 83.10 31 -133 4.0 4.0

5 rows × 55 columns

# Load the data
EduData = pd.read_csv('EduDataset.csv')
EduData.head(3)
EduData.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 18 columns):
 #   Column                                                             Non-Null Count  Dtype  
---  ------                                                             --------------  -----  
 0   county                                                             130 non-null    object 
 1   Expeditures for Public Elementary and Secondary Schools per Pupil  130 non-null    object 
 2   Income by School County                                            130 non-null    object 
 3   Sum of Number Students                                             130 non-null    int64  
 4   Count of Average Standard Score (2020-21)1                         130 non-null    int64  
 5   Count of Percent African American1                                 130 non-null    int64  
 6   Average of Number Full-time Teachers                               130 non-null    float64
 7   Average of PercentAmericanIndian1                                  130 non-null    object 
 8   Average of PercentAsian1                                           130 non-null    object 
 9   Average of PercentHispanic1                                        130 non-null    object 
 10  Average of PercentPacificIslander1                                 130 non-null    object 
 11  Average of PercentTwoorMoreRaces1                                  130 non-null    object 
 12  Average of PercentWhite1                                           130 non-null    object 
 13  Average_Standard _Score_(2021-22)                                  130 non-null    float64
 14  Average of Rank (2020-21)                                          130 non-null    int64  
 15  Average of Rank Change from (2020-21)                              130 non-null    int64  
 16  Average of SchoolDigger Star Rating                                130 non-null    float64
 17  Average of SchoolDigger Star Rating11                              130 non-null    float64
dtypes: float64(4), int64(5), object(9)
memory usage: 18.4+ KB
print(EduData.dtypes)
county                                                                object
Expeditures for Public Elementary and Secondary Schools per Pupil     object
Income by School County                                               object
Sum of Number Students                                                 int64
Count of Average Standard Score (2020-21)1                             int64
Count of Percent African American1                                     int64
Average of Number Full-time Teachers                                 float64
Average of PercentAmericanIndian1                                     object
Average of PercentAsian1                                              object
Average of PercentHispanic1                                           object
Average of PercentPacificIslander1                                    object
Average of PercentTwoorMoreRaces1                                     object
Average of PercentWhite1                                              object
Average_Standard _Score_(2021-22)                                    float64
Average of Rank (2020-21)                                              int64
Average of Rank Change from (2020-21)                                  int64
Average of SchoolDigger Star Rating                                  float64
Average of SchoolDigger Star Rating11                                float64
dtype: object
EduData['Income by School County'] = EduData['Income by School County'].str.replace(',', '').astype(int)
EduData['Average of Number Full-time Teachers'] = EduData['Average of Number Full-time Teachers'].replace(',', '').astype(int)
import seaborn as sns
import matplotlib.pyplot as plt

sns.regplot(x='Income by School County', y='Expeditures for Public Elementary and Secondary Schools per Pupil', data=EduData)
plt.title('Correlation between Income by School County and Expeditures per Pupil')
plt.xlabel('Income by School County')
plt.ylabel('Expeditures per Pupil')
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt

sns.regplot(x='Expeditures for Public Elementary and Secondary Schools per Pupil', y='Average of SchoolDigger Star Rating', data=EduData)
plt.title('SchoolDigger Star Rating and Expeditures per Pupil')
plt.xlabel('Expeditures per Pupil')
plt.ylabel('SchoolDigger Star Rating')
plt.show()

sns.regplot(x='Income by School County', y='Average_Standard _Score_(2021-22)', data=EduData)
plt.title('Average Standard Score and Income by School County')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()

sns.regplot(x='Income by School County', y='Average_Standard _Score_(2021-22)', data=EduData)
plt.title('Average Standard Score and Income by School County')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()

# Select the columns to use for clustering
X = EduData[['Income by School County', 'Average_Standard _Score_(2021-22)']]

# Perform k-means clustering with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)

# Add the cluster labels to the data frame
EduData['Cluster'] = kmeans.labels_

# Plot the clusters
sns.scatterplot(x='Income by School County', y='Average_Standard _Score_(2021-22)', hue='Cluster', data=EduData)
plt.title('Clustering of Education Data')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()

# Set the color and shape for each cluster
colors = ["blue", "green", "red"]
shapes = ["o", "s", "d"]

# Select the columns to use for clustering
X = EduData[['Income by School County', 'Average_Standard _Score_(2021-22)']]

# Perform k-means clustering with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)

# Add the cluster labels to the data frame
EduData['Cluster'] = kmeans.labels_

# Plot the clusters with different colors and shapes for each cluster
sns.scatterplot(x='Income by School County', y='Average_Standard _Score_(2021-22)', hue='Cluster', style='Cluster', palette=colors, markers=shapes, data=EduData)
plt.title('Clustering of Education Data')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()