#Loading data into python
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn' 

# Read the AP and SAT result tables. The SAT table's "SCHOOL NAME" column is
# dropped before merging so the combined frame carries the name only once.
df = pd.read_csv("2012__AP_Results.csv")
df2 = pd.read_csv("2012_SAT_Results.csv").drop(columns="SCHOOL NAME")
# Join on the school identifier; the default join keeps only the DBNs that
# appear in both tables.
df_both = df.merge(df2, on="DBN")
df_both.head()


# Per-column check for missing values.
# -> None are reported, so look for stand-in markers ("s") instead.
df_both.isnull().any()

DBN                                False
SCHOOL NAME                        False
Num of AP Test Takers              False
Num of AP Total Exams Taken        False
Num of AP Exams Passed             False
Num of SAT Test Takers             False
SAT Critical Reading Avg. Score    False
SAT Math Avg. Score                False
SAT Writing Avg. Score             False
dtype: bool


# Turn the "s" stand-in into a real missing value so isna()/dropna() see it.
# np.nan is passed explicitly: `value=None` is ambiguous in DataFrame.replace
# (pandas releases before 1.4 ignored an explicit None and forward-filled the
# matched cells instead of blanking them).
df_both.replace(to_replace="s", value=np.nan, inplace=True)


# Re-run the missing-value check now that the stand-ins are gone.
df_both.isna().any()

DBN                                False
SCHOOL NAME                        False
Num of AP Test Takers               True
Num of AP Total Exams Taken         True
Num of AP Exams Passed              True
Num of SAT Test Takers              True
SAT Critical Reading Avg. Score     True
SAT Math Avg. Score                 True
SAT Writing Avg. Score              True
dtype: bool


#Initial data frame cleaning
# Keep only the schools that have a value in every column. The explicit
# copy() makes df_clean an independent frame, so the edits below never write
# into a slice of df_both (and do not depend on the chained-assignment
# warning being switched off).
df_clean = df_both.dropna(how="any").copy()
# Dropped after the NaN filter, as before: rows missing a value in this
# column have already been excluded.
df_clean = df_clean.drop(columns="Num of AP Test Takers")
# Cast every remaining measure to integer in a single pass.
integer_columns = [
    'Num of AP Total Exams Taken',
    'Num of AP Exams Passed',
    'Num of SAT Test Takers',
    'SAT Critical Reading Avg. Score',
    'SAT Math Avg. Score',
    'SAT Writing Avg. Score',
]
df_clean = df_clean.astype({column: 'int' for column in integer_columns})


# Summary statistics (count, mean, spread, quartiles) for the SAT math averages
df_clean.loc[:, "SAT Math Avg. Score"].describe()

count    174.000000
mean     447.816092
std       74.561303
min      323.000000
25%      396.250000
50%      434.000000
75%      478.000000
max      735.000000
Name: SAT Math Avg. Score, dtype: float64


# Summary statistics for the SAT critical reading averages
df_clean.loc[:, "SAT Critical Reading Avg. Score"].describe()

count    174.000000
mean     423.270115
std       65.965848
min      300.000000
25%      383.250000
50%      408.500000
75%      444.000000
max      679.000000
Name: SAT Critical Reading Avg. Score, dtype: float64


# Summary statistics for the SAT writing averages
df_clean.loc[:, "SAT Writing Avg. Score"].describe()

count    174.000000
mean     419.977011
std       69.047856
min      298.000000
25%      380.000000
50%      402.500000
75%      442.000000
max      682.000000
Name: SAT Writing Avg. Score, dtype: float64


# AP pass rate on X against each SAT section average on Y (one panel per
# section), with a least-squares best-fit line to show the correlation.
# The fit line is drawn through its two end points only. Handing every
# (unsorted) school to plot() retraces the same straight segment many times,
# and the overlapping dashes merge into a solid stroke wherever the points
# are dense, which made the line look dashed only where data was sparse.
# NOTE(review): X is the ratio passed / taken, not a value scaled to 0-100,
# even though the axis label and title call it a percentage.
fig, (ax_math, ax_read, ax_write) = plt.subplots(3, figsize=(12, 12))
X = df_clean['Num of AP Exams Passed'].values / df_clean['Num of AP Total Exams Taken'].values
Y_math = df_clean['SAT Math Avg. Score'].values
Y_read = df_clean['SAT Critical Reading Avg. Score'].values
Y_write = df_clean['SAT Writing Avg. Score'].values

# End points of the fitted line; they span exactly the scattered X range, so
# the automatic x-limits are the same as when every point was passed.
x_line = np.array([X.min(), X.max()])

sat_panels = (
    (ax_math, Y_math, "SAT Math Scores"),
    (ax_read, Y_read, "SAT Reading Scores"),
    (ax_write, Y_write, "SAT Writing Scores"),
)
for panel, sat_scores, ylabel in sat_panels:
    panel.scatter(X, sat_scores)
    # Same y-range on every panel so the sections can be compared by eye.
    panel.set_ylim([200, 800])
    a, b = np.polyfit(X, sat_scores, 1)
    panel.plot(x_line, a * x_line + b, "--", color="purple")
    panel.set_ylabel(ylabel)

# Only the bottom panel carries the shared x-axis label.
ax_write.set_xlabel("AP Test Pass %")
fig.suptitle("SAT scores vs. %AP test passed")
fig.tight_layout()
plt.show()


df3 = pd.read_csv("2011-2012_High_School_Progress_Report.csv")


#Replacing stand-ins with missing values
# All placeholder spellings are handled in one pass. np.nan is passed
# explicitly: `value=None` is ambiguous in DataFrame.replace (pandas releases
# before 1.4 ignored an explicit None and forward-filled the matched cells
# instead of blanking them).
stand_ins = ["s", ".", "", " "]
df3.replace(to_replace=stand_ins, value=np.nan, inplace=True)


# Attach the progress-report columns to the AP/SAT table, matching on the
# school identifier (default join: only DBNs present in both frames).
df_all = df_both.merge(df3, on="DBN")


# List the merged columns to decide which ones to keep.
df_all.columns

Index(['DBN', 'SCHOOL NAME', 'Num of AP Test Takers',
       'Num of AP Total Exams Taken', 'Num of AP Exams Passed',
       'Num of SAT Test Takers', 'SAT Critical Reading Avg. Score',
       'SAT Math Avg. Score', 'SAT Writing Avg. Score', 'School Name',
       'School Type', 'Overall Score', 'Overall Grade', 'Percentile Rank',
       'Progress Grade', 'Performance Grade', 'Environment Grade',
       'College and Career Readiness Grade',
       'Closing the Achievement Gap Points', 'Principal', 'Enrollment',
       '% Students with Disabilites', '% Students in Self-contained Settings',
       '% Overage', '% Free Lunch', '% Black or Hispanic', '% ELL',
       '8th Gr Math/ELA', 'Peer Index'],
      dtype='object')


#Remove extraneous values
#Overall Grade and School Name are both repetitive
# NOTE(review): the reason school 06M462 is excluded is not recorded here.
is_excluded_school = df_all["DBN"] == "06M462"
df_all.drop(index=df_all.index[is_excluded_school.values], inplace=True)

unused_columns = [
    "Principal",
    "School Name",
    "% ELL",
    '% Students in Self-contained Settings',
    "School Type",
    'Progress Grade',
    'Performance Grade',
    'Environment Grade',
    "% Overage",
    "Overall Grade",
]
df_all.drop(columns=unused_columns, inplace=True)


#DF Cleaning, initializing test scores into numpy arrays
# Keep only the schools with a value in every remaining column. The explicit
# copy() makes df_all_clean an independent frame, so later column writes never
# target a slice of df_all (and do not depend on the chained-assignment
# warning being switched off).
df_all_clean = df_all.dropna(how="any").copy()
# Cast the columns used by the plots below in a single pass: counts and SAT
# averages to integers, percentages and index-style measures to floats.
df_all_clean = df_all_clean.astype({
    'Num of AP Exams Passed': 'int',
    'Num of AP Total Exams Taken': 'int',
    'SAT Critical Reading Avg. Score': 'int',
    'SAT Math Avg. Score': 'int',
    'SAT Writing Avg. Score': 'int',
    '% Free Lunch': 'float',
    '% Black or Hispanic': 'float',
    '8th Gr Math/ELA': 'float',
    'Peer Index': 'float',
    'Closing the Achievement Gap Points': 'float',
})

# AP pass rate (exams passed / exams taken) and the three SAT section
# averages as numpy arrays; the scatter plots below use them as colour values.
XAP = df_all_clean['Num of AP Exams Passed'].values / df_all_clean['Num of AP Total Exams Taken'].values
XMath = df_all_clean['SAT Math Avg. Score'].values
XRead = df_all_clean['SAT Critical Reading Avg. Score'].values
XWrite = df_all_clean['SAT Writing Avg. Score'].values


# 2x2 grid of 2-D scatter plots: % free lunch on x, % Black or Hispanic on y.
# Each panel colours the same points by a different test result (AP pass rate
# and the three SAT section averages), with its own colorbar.
free_lunch = df_all_clean['% Free Lunch'].values
minority = df_all_clean['% Black or Hispanic'].values

import seaborn as sns
cmap = sns.cubehelix_palette(as_cmap=True, start=1.7, rot=.75)
f, ax = plt.subplots(2, 2, figsize=(12, 10))

# (axes, colour values, colorbar label) for each of the four panels.
color_panels = (
    (ax[0, 0], XAP, "AP Test Pass %"),
    (ax[0, 1], XMath, "SAT Math Score"),
    (ax[1, 0], XRead, "SAT Reading Score"),
    (ax[1, 1], XWrite, "SAT Writing Score"),
)
for panel, color_values, colorbar_label in color_panels:
    points = panel.scatter(free_lunch, minority, c=color_values, s=50, cmap=cmap)
    plt.setp(panel, xlabel="% Free Lunch", ylabel="% Minority")
    cb = plt.colorbar(points, ax=panel)
    # Rotate the label to read top-to-bottom and pad it clear of the ticks.
    cb.set_label(colorbar_label, rotation=270, labelpad=20)

plt.show()


# 2x2 grid: % Black or Hispanic on x, "Closing the Achievement Gap Points" on
# y, with each panel coloured by a different test result.
minority = df_all_clean['% Black or Hispanic'].values
acheivment_gap = df_all_clean["Closing the Achievement Gap Points"].values

f, ax = plt.subplots(2, 2, figsize=(12, 10))

color_sources = (XAP, XMath, XRead, XWrite)
colorbar_labels = (
    "AP Test Pass %",
    "SAT Math Score",
    "SAT Reading Score",
    "SAT Writing Score",
)
# ax.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for panel, color_values, colorbar_label in zip(ax.flat, color_sources, colorbar_labels):
    points = panel.scatter(minority, acheivment_gap, c=color_values, s=50, cmap=cmap)
    plt.setp(panel, xlabel="% Minority", ylabel="Closing the Achievement Gap Points")
    cb = plt.colorbar(points, ax=panel)
    cb.set_label(colorbar_label, rotation=270, labelpad=20)

plt.suptitle("Closing the Achievement Gap's effect on Standardized Tests", y=0.925)
plt.show()


# 2x2 grid: 8th grade Math/ELA result on x, peer index on y, with each panel
# coloured by a different test result.
import seaborn as sns
cmap = sns.cubehelix_palette(as_cmap=True, start=1.7, rot=.75)
eighth_grade_test = df_all_clean['8th Gr Math/ELA'].values
peer_index = df_all_clean['Peer Index'].values

f, ax = plt.subplots(2, 2, figsize=(12, 10))

color_sources = (XAP, XMath, XRead, XWrite)
colorbar_labels = (
    "AP Test Pass %",
    "SAT Math Score",
    "SAT Reading Score",
    "SAT Writing Score",
)
# ax.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for panel, color_values, colorbar_label in zip(ax.flat, color_sources, colorbar_labels):
    points = panel.scatter(eighth_grade_test, peer_index, c=color_values, s=50, cmap=cmap)
    plt.setp(panel, xlabel="Eight Grade Math/ELA Test", ylabel="Peer Index Score")
    cb = plt.colorbar(points, ax=panel)
    cb.set_label(colorbar_label, rotation=270, labelpad=20)

plt.suptitle("Peer Index and Eight Grade Testing's impact on Standardized Tests", y=0.925)
plt.show()


# Plot college readiness against the AP and SAT results.
# So far the SAT scores do not appear to correlate with good AP results or
# with any metric of socioeconomic status, so check whether college readiness
# is more correlated with the AP tests or with the SAT. If it is not
# correlated with the AP tests, the AP tests may be a poor measure; if it is,
# the AP tests may be a good measure and the SAT mostly a measure of wealth
# (a preconceived notion).

#College and Career Readiness Score is incomplete for a lot of schools, but the grade is pretty complete.

grade_column = "College and Career Readiness Grade"
# Letter grade -> numeric score, so the groups sort from worst (F=1) to best (A=5).
grade_points = {"A": 5, "B": 4, "C": 3, "D": 2, "F": 1}
# Recode this column only. A frame-wide replace would also rewrite any other
# cell that happens to hold a bare "A".."F".
df_all_clean[grade_column] = df_all_clean[grade_column].replace(grade_points)
readiness = df_all_clean[grade_column].values

grade_scores = sorted(grade_points.values())
# Tick labels in the same order as grade_scores: F, D, C, B, A.
grade_letters = [letter for letter, _ in sorted(grade_points.items(), key=lambda item: item[1])]


def _split_by_grade(values):
    """Return one array per grade score (1..5) from a Series aligned with df_all_clean."""
    return [values[df_all_clean[grade_column] == score].values for score in grade_scores]


# AP pass rate per school (exams passed / exams taken).
ap_pass_rate = df_all_clean['Num of AP Exams Passed'] / df_all_clean['Num of AP Total Exams Taken']

f, ax = plt.subplots(2, 2, figsize=(12, 10))

# (axes, per-school values, y-axis label, y-limits or None to autoscale).
# The SAT panels share the 200-800 range; the AP panel autoscales.
box_panels = (
    (ax[0, 0], ap_pass_rate, "AP Test Pass %", None),
    (ax[0, 1], df_all_clean["SAT Math Avg. Score"], "SAT Math Score", (200, 800)),
    (ax[1, 0], df_all_clean["SAT Critical Reading Avg. Score"], "SAT Reading Score", (200, 800)),
    (ax[1, 1], df_all_clean["SAT Writing Avg. Score"], "SAT Writing Score", (200, 800)),
)
for panel, values, ylabel, ylim in box_panels:
    box = panel.boxplot(_split_by_grade(values), positions=grade_scores, showmeans=True)
    plt.setp(panel, xlabel="College and Career Readiness Grade", ylabel=ylabel)
    if ylim is not None:
        panel.set_ylim(ylim)
    plt.setp(box["medians"], color="purple")
    # Show the letter grades instead of the numeric scores on the x-axis.
    panel.set_xticklabels(grade_letters)


plt.suptitle("College and Career Readiness's effect on Standardized Tests", y=0.925)
plt.show()

	DBN	SCHOOL NAME	Num of AP Test Takers	Num of AP Total Exams Taken	Num of AP Exams Passed	Num of SAT Test Takers	SAT Critical Reading Avg. Score	SAT Math Avg. Score	SAT Writing Avg. Score
0	01M292	HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES	s	s	s	29	355	404	363
1	01M448	UNIVERSITY NEIGHBORHOOD HIGH SCHOOL	37	53	21	91	383	423	366
2	01M450	EAST SIDE COMMUNITY SCHOOL	12	12	s	70	377	402	370
3	01M458	FORSYTH SATELLITE ACADEMY	s	s	s	7	414	401	359
4	01M509	MARTA VALLE HIGH SCHOOL	14	15	s	44	390	433	384

New York Testing Analysis

Elementary Data Analysis

Sumanth Kolli

Table of Contents:

Introduction

Part 1: SAT vs AP test scores

Analysis

Part 2: Compared to Census

Part 2a: Socioeconomic Metrics

Analysis

Analysis

Part 2b: Academic Metrics

Analysis

Analysis

Conclusion

New York Testing Analysis

Elementary Data Analysis

Sumanth Kolli

Table of Contents:

Introduction

Part 1: SAT vs AP test scores

Analysis

Part 2: Compared to Census

Part 2a: Socioeconomic Metrics

Analysis

Analysis

Part 2b: Academic Metrics

Analysis

Analysis

Conclusion

Part 2: Compared to Census

Part 2a: Socioeconomic Metrics

Conclusion