273027

import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Load the data
df1 = pd.read_csv('EduDataset.csv')
df1.head(3)
print(df1.dtypes)

county                                                                object
Expeditures for Public Elementary and Secondary Schools per Pupil     object
Income by School County                                               object
Sum of Number Students                                                 int64
Count of Average Standard Score (2020-21)1                             int64
Count of Percent African American1                                     int64
Average of Number Full-time Teachers                                 float64
Average of PercentAmericanIndian1                                     object
Average of PercentAsian1                                              object
Average of PercentHispanic1                                           object
Average of PercentPacificIslander1                                    object
Average of PercentTwoorMoreRaces1                                     object
Average of PercentWhite1                                              object
Average_Standard _Score_(2021-22)                                    float64
Average of Rank (2020-21)                                              int64
Average of Rank Change from (2020-21)                                  int64
Average of SchoolDigger Star Rating                                  float64
Average of SchoolDigger Star Rating11                                float64
dtype: object

df1['Income by School County'] = df1['Income by School County'].str.replace(',', '')
df1['Income by School County'] = pd.to_numeric(df1['Income by School County'])

df1['Expeditures for Public Elementary and Secondary Schools per Pupil'] = df1['Expeditures for Public Elementary and Secondary Schools per Pupil'].str.replace(',', '')
df1['Expeditures for Public Elementary and Secondary Schools per Pupil'] = pd.to_numeric(df1['Expeditures for Public Elementary and Secondary Schools per Pupil'])

# Load the data
df_EduDataset= pd.read_csv('EduDataset.csv')
df_EduDataset.head(3)

	county	Expeditures for Public Elementary and Secondary Schools per Pupil	Income by School County	Sum of Number Students	Count of Average Standard Score (2020-21)1	Count of Percent African American1	Average of Number Full-time Teachers	Average of PercentAmericanIndian1	Average of PercentAsian1	Average of PercentHispanic1	Average of PercentPacificIslander1	Average of PercentTwoorMoreRaces1	Average of PercentWhite1	Average_Standard _Score_(2021-22)	Average of Rank (2020-21)	Average of Rank Change from (2020-21)	Average of SchoolDigger Star Rating	Average of SchoolDigger Star Rating11
0	Surry County	20,729	56,525	240	1	1	23.70	0.00%	0.00%	1.30%	0.80%	6.30%	42.10%	32.30	880	60	1.0	1.0
1	Arlington County	19,293	122,604	12663	24	24	34.23	0.14%	9.30%	28.38%	0.01%	8.45%	42.64%	63.63	398	-8	3.2	3.2
2	Falls Church city	19,262	146,922	518	1	1	28.00	0.00%	8.50%	12.00%	0.00%	11.00%	65.30%	90.10	24	-28	5.0	5.0

# Load the county shapefile for Virginia
counties = gpd.read_file('VirginiaCounty.shp')
counties.head(30)
counties.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 133 entries, 0 to 132
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   STCOFIPS    133 non-null    object  
 1   GNIS        133 non-null    object  
 2   NAME        133 non-null    object  
 3   NAMELSAD    133 non-null    object  
 4   GSOURCE     133 non-null    object  
 5   LADOPT      133 non-null    object  
 6   AREASQMI    133 non-null    float64 
 7   LASTUPDATE  130 non-null    object  
 8   JURISTYPE   133 non-null    object  
 9   geometry    133 non-null    geometry
dtypes: float64(1), geometry(1), object(8)
memory usage: 10.5+ KB

# join the income data with the shapefile by county name
gdf_counties_income = counties.merge(df1, left_on='NAMELSAD', right_on='county')
gdf_counties_income.head(3)

	STCOFIPS	GNIS	NAME	NAMELSAD	GSOURCE	LADOPT	AREASQMI	LASTUPDATE	JURISTYPE	geometry	...	Average of PercentAsian1	Average of PercentHispanic1	Average of PercentPacificIslander1	Average of PercentTwoorMoreRaces1	Average of PercentWhite1	Average_Standard _Score_(2021-22)	Average of Rank (2020-21)	Average of Rank Change from (2020-21)	Average of SchoolDigger Star Rating	Average of SchoolDigger Star Rating11
0	51121	1674630	Montgomery	Montgomery County	L	Y	389.266823	2021-09-23	CO	MULTIPOLYGON (((-72947.418 152168.110, -72250....	...	4.67%	7.17%	0.05%	6.69%	77.19%	68.30	418	58	3.3	3.3
1	51167	1497573	Russell	Russell County	V	N	476.490225	2014-09-24	CO	POLYGON ((-203087.739 108906.044, -203120.476 ...	...	0.24%	0.86%	0.00%	0.46%	97.90%	67.60	371	1	3.4	3.4
2	51089	1502770	Henry	Henry County	T	N	384.484406	2021-09-23	CO	POLYGON ((-13013.107 92620.310, -13061.861 923...	...	0.29%	15.11%	0.07%	9.57%	51.54%	42.96	785	98	1.8	1.8

3 rows × 28 columns

# create a choropleth map of income by county
fig, ax = plt.subplots(1, 1)
gdf_counties_income.plot(column='Income by School County', cmap='BuGn', ax=ax, legend=True, vmin=0, vmax=150000)
ax.set_title('Income by County in Virginia', y=1.55)
plt.show()

# create a choropleth map of Expeditures for Public Elementary and Secondary Schools per Pupil by county
fig, ax = plt.subplots(1, 1)
gdf_counties_income.plot(column='Expeditures for Public Elementary and Secondary Schools per Pupil', 
                         cmap='BuGn', ax=ax, legend=True, vmin=0, vmax=22000)
ax.set_title('Expeditures for Public Elementary and Secondary Schools per Pupil', y=1.55)
plt.show()

# compute summary statistics for the income column
income_summary = df1['Income by School County'].describe()

# print the income summary table
print(income_summary)

count        130
unique       130
top       56,525
freq           1
Name: Income by School County, dtype: object

# compute summary statistics for the Expeditures for Public Elementary and Secondary Schools per Pupil column
Expeditures_summary = df1['Expeditures for Public Elementary and Secondary Schools per Pupil'].describe()

# print the income summary table
print(Expeditures_summary)

count        130
unique       126
top       11,978
freq           2
Name: Expeditures for Public Elementary and Secondary Schools per Pupil, dtype: object

# join last datset by county name
educationQualityDataset = gdf_counties_income.merge(df_EduDataset, left_on='NAMELSAD', right_on='county')
educationQualityDataset.head()

	STATEFP	COUNTYFP	COUSUBFP	COUSUBNS	GEOID	NAME	NAMELSAD	LSAD	CLASSFP	MTFCC	...	Average of PercentAsian1_y	Average of PercentHispanic1_y	Average of PercentPacificIslander1_y	Average of PercentTwoorMoreRaces1_y	Average of PercentWhite1_y	Average_Standard _Score_(2021-22)_y	Average of Rank (2020-21)_y	Average of Rank Change from (2020-21)_y	Average of SchoolDigger Star Rating_y	Average of SchoolDigger Star Rating11_y
0	51	570	90948	01498420	5157090948	Colonial Heights	Colonial Heights city	25	Z7	G4040	...	2.90%	15.43%	0.00%	10.87%	44.77%	45.83	716	63	1.7	1.7
1	51	790	95875	01789075	5179095875	Staunton	Staunton city	25	Z7	G4040	...	1.40%	7.10%	0.07%	14.60%	62.70%	55.03	554	20	2.7	2.7
2	51	540	90780	01789068	5154090780	Charlottesville	Charlottesville city	25	Z7	G4040	...	5.60%	10.98%	0.00%	12.73%	41.92%	40.97	645	-52	1.8	1.8
3	51	700	94851	01498555	5170094851	Newport News	Newport News city	25	Z7	G4040	...	1.86%	16.26%	0.32%	8.61%	21.36%	21.77	915	13	0.8	0.8
4	51	735	95115	01498436	5173595115	Poquoson	Poquoson city	25	Z7	G4040	...	0.90%	4.90%	0.00%	2.90%	89.40%	83.10	31	-133	4.0	4.0

5 rows × 55 columns

# Load the data
EduData = pd.read_csv('EduDataset.csv')
EduData.head(3)
EduData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 18 columns):
 #   Column                                                             Non-Null Count  Dtype  
---  ------                                                             --------------  -----  
 0   county                                                             130 non-null    object 
 1   Expeditures for Public Elementary and Secondary Schools per Pupil  130 non-null    object 
 2   Income by School County                                            130 non-null    object 
 3   Sum of Number Students                                             130 non-null    int64  
 4   Count of Average Standard Score (2020-21)1                         130 non-null    int64  
 5   Count of Percent African American1                                 130 non-null    int64  
 6   Average of Number Full-time Teachers                               130 non-null    float64
 7   Average of PercentAmericanIndian1                                  130 non-null    object 
 8   Average of PercentAsian1                                           130 non-null    object 
 9   Average of PercentHispanic1                                        130 non-null    object 
 10  Average of PercentPacificIslander1                                 130 non-null    object 
 11  Average of PercentTwoorMoreRaces1                                  130 non-null    object 
 12  Average of PercentWhite1                                           130 non-null    object 
 13  Average_Standard _Score_(2021-22)                                  130 non-null    float64
 14  Average of Rank (2020-21)                                          130 non-null    int64  
 15  Average of Rank Change from (2020-21)                              130 non-null    int64  
 16  Average of SchoolDigger Star Rating                                130 non-null    float64
 17  Average of SchoolDigger Star Rating11                              130 non-null    float64
dtypes: float64(4), int64(5), object(9)
memory usage: 18.4+ KB

print(EduData.dtypes)

county                                                                object
Expeditures for Public Elementary and Secondary Schools per Pupil     object
Income by School County                                               object
Sum of Number Students                                                 int64
Count of Average Standard Score (2020-21)1                             int64
Count of Percent African American1                                     int64
Average of Number Full-time Teachers                                 float64
Average of PercentAmericanIndian1                                     object
Average of PercentAsian1                                              object
Average of PercentHispanic1                                           object
Average of PercentPacificIslander1                                    object
Average of PercentTwoorMoreRaces1                                     object
Average of PercentWhite1                                              object
Average_Standard _Score_(2021-22)                                    float64
Average of Rank (2020-21)                                              int64
Average of Rank Change from (2020-21)                                  int64
Average of SchoolDigger Star Rating                                  float64
Average of SchoolDigger Star Rating11                                float64
dtype: object

EduData['Income by School County'] = EduData['Income by School County'].str.replace(',', '').astype(int)

EduData['Average of Number Full-time Teachers'] = EduData['Average of Number Full-time Teachers'].replace(',', '').astype(int)

import seaborn as sns
import matplotlib.pyplot as plt

sns.regplot(x='Income by School County', y='Expeditures for Public Elementary and Secondary Schools per Pupil', data=EduData)
plt.title('Correlation between Income by School County and Expeditures per Pupil')
plt.xlabel('Income by School County')
plt.ylabel('Expeditures per Pupil')
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt

sns.regplot(x='Expeditures for Public Elementary and Secondary Schools per Pupil', y='Average of SchoolDigger Star Rating', data=EduData)
plt.title('SchoolDigger Star Rating and Expeditures per Pupil')
plt.xlabel('Expeditures per Pupil')
plt.ylabel('SchoolDigger Star Rating')
plt.show()

sns.regplot(x='Income by School County', y='Average_Standard _Score_(2021-22)', data=EduData)
plt.title('Average Standard Score and Income by School County')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()

sns.regplot(x='Income by School County', y='Average_Standard _Score_(2021-22)', data=EduData)
plt.title('Average Standard Score and Income by School County')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()

# Select the columns to use for clustering
X = EduData[['Income by School County', 'Average_Standard _Score_(2021-22)']]

# Perform k-means clustering with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)

# Add the cluster labels to the data frame
EduData['Cluster'] = kmeans.labels_

# Plot the clusters
sns.scatterplot(x='Income by School County', y='Average_Standard _Score_(2021-22)', hue='Cluster', data=EduData)
plt.title('Clustering of Education Data')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()

# Set the color and shape for each cluster
colors = ["blue", "green", "red"]
shapes = ["o", "s", "d"]

# Select the columns to use for clustering
X = EduData[['Income by School County', 'Average_Standard _Score_(2021-22)']]

# Perform k-means clustering with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)

# Add the cluster labels to the data frame
EduData['Cluster'] = kmeans.labels_

# Plot the clusters with different colors and shapes for each cluster
sns.scatterplot(x='Income by School County', y='Average_Standard _Score_(2021-22)', hue='Cluster', style='Cluster', palette=colors, markers=shapes, data=EduData)
plt.title('Clustering of Education Data')
plt.xlabel('Income by School County')
plt.ylabel('Average Standard Score')
plt.show()