##This code imports LFS data from SPSS files and finds the employment rate and median salaries for graudates
#based on various characteristics
##
library("foreign")
library("plyr")
library("reshape2")
library("rlang")
library("spatstat")
library("dplyr")
library("lazyeval")
library("survey")
library("tidyr")
#######################################################
# Functions used within the code #
#######################################################
######################################################################################
#Add in the function to change SPSS data to R data
######################################################################################
readLFS<-function(quarter, year){
#Latest data
test<-read.spss(paste0("SPSS datasets/",year," Q",quarter,".sav"),to.data.frame=TRUE)
#Select variables needed
colnames<-c("COUNTRY","SEX","AGE", names(test[grep("PWT", names(test))]),
names(test[grep("PIWT", names(test))]),
names(test[grep("HIQUAL", names(test))]),"HIGHO",
names(test[grep("DEGREE", names(test))]),"SC2KMMJ",
"SC10MMJ","ETH11EW",
"SNGDEGB","SNGHD","FDSNGDEG","ILODEFR","DISEA","DEGCLS7",
"FTPTWK","GRSSWK", "INDE07M","UALA","CASENO","GOVTOF2")
#not all variables are in each dataset
available_columns<-colnames[which(colnames %in% names(test))]
test2<- test[,available_columns]
#restrict to population in England aged 16-64
test2<-test2[which(test2$COUNTRY=="England" & test2$AGE>15 & test2$AGE<65),]
saveRDS(object = test2, file = paste0("Rds datasets/Q",quarter,"_",year,".rds"))
}
#This function will then read the csv, subset and filter as in the function above
#before saving as an RDS file.
Catch_remaining_datasets<-function(quarter, year){
test <-read.csv(paste0("CSV datasets/", year," Q",quarter,".csv"))
if(dim(test)[1]==0){#add in error warning
stop(return(paste0("Warning: dataset Q",quarter," ",year,"won't read from SPSS and doesn't exist as CSV:
convert SPSS file to CSV")))
}
colnames<-c("COUNTRY","SEX","AGE", names(test[grep("PWT", names(test))]),
names(test[grep("PIWT", names(test))]),
names(test[grep("HIQUAL", names(test))]),"HIGHO",
names(test[grep("DEGREE", names(test))]),"SC2KMMJ",
"SC10MMJ","ETH11EW",
"SNGDEGB","SNGHD","FDSNGDEG","ILODEFR","DISEA","DEGCLS7",
"FTPTWK","GRSSWK", "INDE07M","UALA","CASENO","GOVTOF2")
#not all variables are in each dataset
available_columns<-colnames[which(colnames %in% names(test))]
test2<- test[,available_columns]
#restrict to population in England aged 16-64
test2<-test2[which(test2$COUNTRY=="England" & test2$AGE>15 & test2$AGE<65),]
saveRDS(object = test2, file = paste0("Rds datasets/Q",quarter,"_",year,".rds"))
assign(paste0("Q",quarter,"_",year),test2,envir = globalenv())
}
#This function calls up RDs files already saved and loads them into the global environment
read_LFS_from_project<-function(quarter, year){
temp <-readRDS(paste0("Rds datasets/Q",quarter,"_", year,".rds"))
assign(paste0("Q",quarter,"_",year),temp,envir = globalenv())
if(dim(temp)[1]==0){#add in error warning
Catch_remaining_datasets(quarter, year)}
}
######################################################################################
#Add in the function to recode variables in the LFS to those used in the publication
######################################################################################
#Recode variables for ease of grouping in the publication.
recode_variables<- function(quarter, year){
#fetch the dataset from the global environment: for example Q1_2016
dataset<-get(paste0("Q",quarter,"_",year))
#use 'highqual15' for LFS beyond 2014
#For pre 2014 use HIQUAL11<- 2012-2014, HIQUAL5 2006-2011
hiqual_var<-names(dataset[grep("HIQUAL", names(dataset))])[1]
if(class(dataset[,c(hiqual_var)])=="numeric"){
#add in error warning if not a factor/string variable
paste0("Warning: dataset Q",quarter," ",year,"is missing factor levels in HIQUAL")
}
# Check if data is the right type, if not convert to a factor with the correct value levels.
if("HIQUAL15" %in% names(dataset)& class(dataset$HIQUAL15)=="numeric"){
dataset$HIQUAL15<- as.factor(dataset$HIQUAL15)
levels(dataset$HIQUAL15)<-high_qual15
}
##Define Graduate Type
#Define graduates, postgraduates and non-graduates
dataset$Graduate_Type<-NA
if("DEGREE71" %in% names(dataset)){
dataset$Graduate_Type[(dataset[,c(hiqual_var)]=="First degree/foundation degree"|
dataset[,c(hiqual_var)]=="First/Foundation degree"
) & (
dataset$DEGREE71=="first degree" |
dataset$DEGREE72=="first degree" |
dataset$DEGREE73=="first degree" |
dataset$DEGREE74=="first degree" |
dataset$DEGREE75=="first degree" )]<-"Graduate"}
if("DEGREE4" %in% names(dataset)){
dataset$Graduate_Type[(dataset[,c(hiqual_var)]=="First degree/foundation degree"|
dataset[,c(hiqual_var)]=="First/Foundation degree"
) & (
dataset$DEGREE4=="A first degree" )]<-"Graduate"}
dataset$Graduate_Type[dataset[,c(hiqual_var)]=="Higher degree"
& dataset$HIGHO != "Dont know"]<-"PostGrad"
dataset$Graduate_Type[dataset[,c(hiqual_var)]=="Level 8 Diploma"|
dataset[,c(hiqual_var)]=="Level 8 Certificate"|
dataset[,c(hiqual_var)]=="Level 8 Award"|
dataset[,c(hiqual_var)]=="Level 7 Certificate"|
dataset[,c(hiqual_var)]=="Level 7 Diploma"|
dataset[,c(hiqual_var)]=='Level 7 Award'|
dataset[,c(hiqual_var)]=='Level 6 Award'|
dataset[,c(hiqual_var)]=='Level 6 Certificate'|
dataset[,c(hiqual_var)]=='Level 6 Diploma'|
dataset[,c(hiqual_var)]=="Don.t know"|
dataset[,c(hiqual_var)]=="Don't know"|
dataset[,c(hiqual_var)]=="Did not know"|
(dataset[,c(hiqual_var)]=="Higher degree" & dataset$HIGHO =="Dont know")|
(dataset[,c(hiqual_var)]=="Higher degree" & is.na(dataset$HIGHO))
]<-"Not in scope"
#Set everyone who isn't a graduate, post graduate or not in scope to a non graduate
dataset$Graduate_Type[is.na(dataset$Graduate_Type)]<-"Non Grad"
# AgeGroup.
dataset$AgeGroup[dataset$AGE %in% 16:20]<-"16-20"
dataset$AgeGroup[dataset$AGE %in% 21:30]<-"21-30"
dataset$AgeGroup[dataset$AGE %in% 31:40]<-"31-40"
dataset$AgeGroup[dataset$AGE %in% 41:50]<-"41-50"
dataset$AgeGroup[dataset$AGE %in% 51:60]<-"51-60"
#Note error in 2016 (age 60 in last bracket)
dataset$AgeGroup[dataset$AGE %in% 61:64]<-"61-64"
#The remaining variables are added to the latest datasets
if("SC10MMJ" %in% names(dataset)){
#Define skilled occupation groupings
SOCHE1<-c("1 'Managers, Directors And Senior Officials'"
,"2 'Professional Occupations'"
,"3 'Associate Professional And Technical Occupations'")
SOCHE2<-c("4 'Administrative And Secretarial Occupations'"
,"5 'Skilled Trades Occupations'"
,"6 'Caring, Leisure And Other Service Occupations'" )
SOCHE3<-c("7 'Sales And Customer Service Occupations'"
,"8 'Process, Plant And Machine Operatives'"
,"9 'Elementary Occupations'" )
dataset$SOCHE[dataset$SC10MMJ %in% SOCHE1]<-"High Skilled Employment"
dataset$SOCHE[dataset$SC10MMJ %in% SOCHE2]<-"Medium Skilled Employment"
dataset$SOCHE[dataset$SC10MMJ %in% SOCHE3]<-"Low Skilled Employment"
dataset$SOCHE[is.na(dataset$SOCHE)]<-"Else"
}
#Define skilled occupation groupings
if("SC2KMMJ" %in% names(dataset)){
SOCHE1<-c("1 'Managers and Senior Officials'"
,"2 'Professional occupations'"
,"3 'Associate Professional and Technical'" )
SOCHE2<-c("4 'Administrative and Secretarial'"
,"5 'Skilled Trades Occupations'"
,"6 'Personal Service Occupations'" )
SOCHE3<-c("7 'Sales and Customer Service Occupations'"
,"8 'Process, Plant and Machine Operatives'"
,"9 'Elementary Occupations'" )
dataset$SOCHE[dataset$SC2KMMJ %in% SOCHE1]<-"High Skilled Employment"
dataset$SOCHE[dataset$SC2KMMJ %in% SOCHE2]<-"Medium Skilled Employment"
dataset$SOCHE[dataset$SC2KMMJ %in% SOCHE3]<-"Low Skilled Employment"
dataset$SOCHE[is.na(dataset$SOCHE)]<-"Else"
}
if("ETH11EW" %in% names(dataset)){
#Define ethnic group
dataset$ETHNICITY<-dataset$ETH11EW
dataset$ETHNICITY[dataset$ETH11EW %in% c("Mixed / Multiple ethnic groups" ,"Chinese","Arab")]<-"Other ethnic group."
}
#Define Subject Type
if("SNGDEGB" %in% names(dataset)){
dataset$STEM[dataset$SNGDEGB %in% 1:9]<- 1
#10.1~Economics, Law, Business
dataset$LEM[dataset$SNGDEGB %in% 11:12| (substr(dataset$FDSNGDEG,1,4)=="10.1"
& dataset$Graduate_Type=="Graduate")|
(substr(dataset$SNGHD,1,4)=="10.1"
& dataset$Graduate_Type=="PostGrad")]<-1
#10.0~Social Studies
#10.2:10.9~Politics, Sociology, Social Policy, Social Work,Anthropology,
#Human and Social Geography, Other Social Studies
#
dataset$OSSAH[dataset$SNGDEGB %in% 13:19|
(substr(dataset$FDSNGDEG,1,4) %in% c("10.0","10.2","10.3","10.4","10.5",
"10.6","10.7","10.8","10.9")
& dataset$Graduate_Type=="Graduate")|
(substr(dataset$SNGHD,1,4) %in% c("10.0","10.2","10.3","10.4","10.5",
"10.6","10.7","10.8","10.9")
& dataset$Graduate_Type=="PostGrad")]<-1
}
if("SC10MMJ" %in% names(dataset)){
#* Occupation.
dataset$Occupation<-dataset$SC10MMJ
dataset$Occupation<-factor(dataset$Occupation, levels=c(levels(dataset$Occupation),"10 'Medium Skilled Employment'","11 'Low Skilled Employment'"))
dataset$Occupation[dataset$SC10MMJ %in% c("4 'Administrative And Secretarial Occupations'",
"5 'Skilled Trades Occupations'",
"6 'Caring, Leisure And Other Service Occupations'")]<-"10 'Medium Skilled Employment'"
dataset$Occupation[dataset$SC10MMJ %in% c("7 'Sales And Customer Service Occupations'" , "8 'Process, Plant And Machine Operatives'"
,"9 'Elementary Occupations'" )]<-"11 'Low Skilled Employment'"
}
#Labels Occupation
#1 'Managers, Directors and Senior Officials'
#2 'Professional Occupations'
#3 'Associate Professional and Technical Occupations'
#10 'Medium Skilled Employment'
#11 'Low Skilled Employment'.
assign(paste0("Q",quarter,"_",year),dataset,envir = globalenv())
}
#########################################################################################
# Employment rate by demographic #
#########################################################################################
#Headline stats:
#for 16-64s
Headline_stats_function<-function(all,agebracket){
#subset data if needed
if(agebracket=="16-64"){all<- all}else{all<-all[which(all$AgeGroup=="21-30"),]}
all %>%
group_by(Graduate_Type)%>% summarise(total=sum(freq),sample=sum(sample))->full_pop
types<-full_pop["Graduate_Type"]
all[which(all$ILODEFR=="In employment"),] %>%
group_by(Graduate_Type)%>% summarise(count=sum(freq),sample=sum(sample))%>%
merge(x=types, y=., by=c("Graduate_Type"), all.x=TRUE)->in_employment
all[which(all$ILODEFR=="In employment" & all$SOCHE=="High Skilled Employment"),] %>%
group_by(Graduate_Type)%>% summarise(count=sum(freq),sample=sum(sample))%>%
merge(x=types, y=., by=c("Graduate_Type"), all.x=TRUE)->in_HSemployment
all[which(all$ILODEFR=="ILO unemployed"),] %>%
group_by(Graduate_Type)%>% summarise(count=sum(freq),sample=sum(sample))%>%
merge(x=types, y=., by=c("Graduate_Type"), all.x=TRUE)->unemployed
all[which(all$ILODEFR=="Inactive"),] %>%
group_by(Graduate_Type)%>% summarise(count=sum(freq),sample=sum(sample))%>%
merge(x=types, y=., by=c("Graduate_Type"), all.x=TRUE)->inactive
Headline_stats<-data.frame( in_employment$Graduate_Type,
in_employment$count/full_pop$total,
in_HSemployment$count/full_pop$total,
unemployed$count/(unemployed$count + in_employment$count),
inactive$count/full_pop$total,
in_employment$sample,
in_HSemployment$sample,
unemployed$sample,
inactive$sample,
in_employment$count,
in_HSemployment$count,
unemployed$count,
inactive$count)
Headline_stats$Age_range<-agebracket
colnames(Headline_stats)<-c("Population","Employment_rate","High_skill_emp_rate","Unemployment_rate","Inactivity_rate",
"Employed sample","HS employed sample", "unemployed sample", "inactive sample","employed total","HS employed total",
"unemployed total","inactive total","Age_range")
Headline_stats<-Headline_stats[c("Age_range","Population","Employment_rate","High_skill_emp_rate","Unemployment_rate","Inactivity_rate",
"Employed sample","HS employed sample", "unemployed sample", "inactive sample","employed total","HS employed total",
"unemployed total","inactive total")]
}
Employment_rate<-function(quarter, year){
#get the dataset from the global environment
dataset<-get(paste0("Q",quarter,"_",year))
#find the weight variable
weight<-tail(sort(names(dataset[grep("PWT", names(dataset))])),1)
dataset<-dataset[which(dataset[,weight]>0),]
# count all people by age, graduate type, ILO employment status and SOC group: weight people by the weight variable (such as PWT16)
#also note sample size
all<-dataset%>%
group_by(AgeGroup,Graduate_Type,ILODEFR,SOCHE) %>%
summarise(freq=sum(!!sym(weight)), sample=n())
Headline_stats<-Headline_stats_function(all,"16-64")
#Headline stats for 21-30s
Headline_stats_young<-Headline_stats_function(all,"21-30")
headline_stats<-rbind(Headline_stats,Headline_stats_young)
assign(paste0("Headline_Q",quarter,"_",year),headline_stats,envir = globalenv())
#write.csv(headline_stats,paste0("Outputs for GLMS/Headline_Q",quarter,"_",year,".csv"))
}
#combine employment rates for each quarter into a yearly mean average
year_average_employment_rate<-function(year){
#select dataset: for example if quarter =1 and year =2016 this will get Q1_2016 and set this to 'dataset'
dataset1<-get(paste0("Headline_Q1_",year))
dataset2<-get(paste0("Headline_Q2_",year))
dataset3<-get(paste0("Headline_Q3_",year))
dataset4<-get(paste0("Headline_Q4_",year))
year_data<-rbind(dataset1,dataset2,dataset3,dataset4)
year_data%>%
group_by( Age_range, Population) %>%
summarise(Employment_rate=mean.default(Employment_rate),
High_skill_emp_rate= mean.default(High_skill_emp_rate),
Unemployment_rate=mean.default(Unemployment_rate),
inactivity_rate= mean.default(Inactivity_rate),
employed_sample=sum(`Employed sample`),
HS_employed_sample= sum(`HS employed sample`),
unemployed_sample=sum(`unemployed sample`),
inactive_sample=sum(`inactive sample`),
employed_total=sum(`employed total`),
HS_employed_total=sum(`HS employed total`),
unemployed_total=sum(`unemployed total`),
inactive_total=sum(`inactive total`)) -> year_average
year_average$year<-year
assign(paste0("Headline_",year),year_average,envir = globalenv())
#write.csv(year_average,paste0("Outputs for GLMS/Headline_",year,".csv"))
}
Employment_counts_by<-function(dataset,demographic){
weight<-tail(sort(names(dataset[grep("PWT", names(dataset))])),1)
as.data.frame(
dataset%>%
group_by_(demographic,"Graduate_Type","ILODEFR","SOCHE")%>%
dplyr::summarise(sample=n(),
population=sum(!!sym(weight))))
}
variable_splits<-function(dataset, variable_y){
dataset %>%
group_by_(variable_y)%>% summarise(total=sum(population))->full_pop
dataset[which(dataset$ILODEFR=="In employment"),] %>%
group_by_(variable_y)%>% summarise(employed.count=sum(population), employed.sample=sum(sample))->in_employment
dataset[which(dataset$ILODEFR=="In employment" & dataset$SOCHE=="High Skilled Employment"),] %>%
group_by_(variable_y)%>% summarise(HSemp.count=sum(population),HSemp.sample=sum(sample))->in_HSemployment
dataset[which(dataset$ILODEFR=="ILO unemployed"),] %>%
group_by_(variable_y)%>% summarise(Unemp.count=sum(population),Unemp.sample=sum(sample))->unemployed
dataset[which(dataset$ILODEFR=="Inactive"),] %>%
group_by_(variable_y)%>% summarise(Inactive.count=sum(population),Inactive.sample=sum(sample))->inactive
list<-list(full_pop,in_employment,in_HSemployment, unemployed,inactive)
total_count<-Reduce(function(x, y) merge(x, y, all.x=T,
by=c(variable_y)), list, accumulate=F)
Graduate_breakdown<-data.frame( total_count[variable_y],
total_count$employed.count/total_count$total,
total_count$employed.sample,
total_count$HSemp.count/total_count$total,
total_count$HSemp.sample,
total_count$Unemp.count/(total_count$Unemp.count + total_count$employed.count),
total_count$Unemp.sample,
total_count$Inactive.count/total_count$total,
total_count$Inactive.sample)
colnames(Graduate_breakdown)<-c("Breakdown","Employment_rate","Employed_sample",
"High_skill_emp_rate","High_Skill_employed_sample",
"Unemployment_rate","Unemployed_sample",
"Inactivity_rate","Inactive_sample")
return(Graduate_breakdown)
}
#provide employment rates for the year broken down by characteristic
Graduate_breakdown<-function(quarter,year){
#select dataset: for example if quarter =1 and year =2016 this will get Q1_2016 and set this to 'dataset'
dataset<-get(paste0("Q",quarter,"_",year))
#reduce dataset to just graduates
dataset<-dataset[which(dataset$Graduate_Type=="Graduate"),]
Employment_counts_by(dataset,"AgeGroup")%>% variable_splits(.,"AgeGroup")->Age_group_rates
Age_group_rates$Population<-"AgeGroup"
Employment_counts_by(dataset,"SEX") %>% variable_splits(.,"SEX")->Gender_rates
Gender_rates$Population<-"SEX"
Employment_counts_by(dataset,"ETHNICITY") %>% variable_splits(.,"ETHNICITY")->Ethnicity_rates
Ethnicity_rates$Population<-"ETHNICITY"
Employment_counts_by(dataset,"DISEA") %>% variable_splits(.,"DISEA")->Disability_rates
Disability_rates$Population<-"DISEA"
Employment_counts_by(dataset,"DEGCLS7") %>% variable_splits(.,"DEGCLS7")->Degree_class_rates
Degree_class_rates$Population<-"DEGCLS7"
Employment_counts_by(dataset,"STEM") %>% variable_splits(.,"STEM")->STEM_rates
STEM_rates$Population<-"STEM"
Employment_counts_by(dataset,"LEM") %>% variable_splits(.,"LEM")->LEM_rates
LEM_rates$Population<-"LEM"
Employment_counts_by(dataset,"OSSAH") %>% variable_splits(.,"OSSAH")->OSSAH_rates
OSSAH_rates$Population<-"OSSAH"
Employment_counts_by(dataset,"GOVTOF2") %>% variable_splits(.,"GOVTOF2")->Region_rates
Region_rates$Population<-"GOVTOF2"
quarter<-rbind(Age_group_rates,Gender_rates,Ethnicity_rates, Disability_rates, Degree_class_rates,
STEM_rates, LEM_rates, OSSAH_rates,Region_rates)
quarter$Group<-"Graduates:16-64"
#reduce dataset to just young graduates
dataset<-dataset[which(dataset$Graduate_Type=="Graduate" & dataset$AgeGroup=="21-30"),]
Employment_counts_by(dataset,"SEX") %>% variable_splits(.,"SEX")->Gender_rates
Gender_rates$Population<-"SEX"
Employment_counts_by(dataset,"ETHNICITY") %>% variable_splits(.,"ETHNICITY")->Ethnicity_rates
Ethnicity_rates$Population<-"ETHNICITY"
Employment_counts_by(dataset,"DISEA") %>% variable_splits(.,"DISEA")->Disability_rates
Disability_rates$Population<-"DISEA"
Employment_counts_by(dataset,"DEGCLS7") %>% variable_splits(.,"DEGCLS7")->Degree_class_rates
Degree_class_rates$Population<-"DEGCLS7"
Employment_counts_by(dataset,"STEM") %>% variable_splits(.,"STEM")->STEM_rates
STEM_rates$Population<-"STEM"
Employment_counts_by(dataset,"LEM") %>% variable_splits(.,"LEM")->LEM_rates
LEM_rates$Population<-"LEM"
Employment_counts_by(dataset,"OSSAH") %>% variable_splits(.,"OSSAH")->OSSAH_rates
OSSAH_rates$Population<-"OSSAH"
Employment_counts_by(dataset,"GOVTOF2") %>% variable_splits(.,"GOVTOF2")->Region_rates
Region_rates$Population<-"GOVTOF2"
quarter2<-rbind(Age_group_rates,Gender_rates,Ethnicity_rates, Disability_rates, Degree_class_rates,
STEM_rates, LEM_rates, OSSAH_rates,Region_rates)
quarter2$Group<-"Graduates:21-30"
quarter<-rbind(quarter,quarter2)
return(quarter)
}
Graduate_breakdown_year<-function(year){
#select dataset: for example if quarter =1 and year =2016 this will get Q1_2016 and set this to 'dataset'
dataset1<-Graduate_breakdown(1,year)
dataset2<-Graduate_breakdown(2,year)
dataset3<-Graduate_breakdown(3,year)
dataset4<-Graduate_breakdown(4,year)
year_data<- rbind(dataset1,dataset2,dataset3,dataset4)
means<-ddply(year_data, .(Group,Population, Breakdown), colwise(mean, .(Employment_rate,
High_skill_emp_rate,
Unemployment_rate, Inactivity_rate),na.rm=TRUE))
samples<-ddply(year_data, .(Group,Population, Breakdown),colwise(sum, .(Employed_sample,
High_Skill_employed_sample,
Unemployed_sample, Inactive_sample),na.rm=TRUE))
output<-merge(x=means, y=samples, by=c("Group","Population","Breakdown"), all=TRUE)
output<-output[c("Group",
"Population",
"Breakdown",
"Employment_rate",
"Employed_sample",
"High_skill_emp_rate",
"High_Skill_employed_sample",
"Unemployment_rate",
"Unemployed_sample",
"Inactivity_rate",
"Inactive_sample")]
assign(paste0("Graduate_breakdown_",year),output,envir = globalenv())
write.csv(output,paste0("Outputs for GLMS/Graduate_breakdown_",year,".csv"))
}
#########################################################################################
# Average Salary by demographic #
#########################################################################################
#Find the median, lower and upper quartiles of salaries by demographic characteristic, and reshape (melt) data to have the characteristic as a variable rather than
#column name
#This uses a weighted average method to provide an unbiased estimate. (as in method 2 here: https://www.xycoon.com/quartiles.htm)
Average_salary_by<-function(dataset,demographic){
#find the weight variable
weight<-tail(sort(names(dataset[grep("PIWT", names(dataset))])),1)
dataset%>%
group_by_at(demographic)%>%
dplyr::summarise(p25=weighted.quantile(GRSSWK, !!sym(weight), probs=0.25),
median=weighted.median(GRSSWK,!!sym(weight)),
p90=weighted.quantile(GRSSWK, !!sym(weight), probs=0.90),
mean=weighted.mean(GRSSWK,!!sym(weight)),
sample=n(),
population=sum(!!sym(weight)))%>%
as.data.frame()%>%
mutate(variable=paste0(demographic, collapse = "_"))
#take a dataset; based on the demographic characeristic separate the dataet into groups; find the quartiles and median of GRSSWK (gross weekly pay) and weight on PIWT16
#reshape dataset for ease of output- turn the demographic into a variable rather than a column heading
}
Average_Salaries<-function(year){
#select dataset:
dataset1<-get(paste0("Q1_",year))
dataset2<-get(paste0("Q2_",year))
dataset3<-get(paste0("Q3_",year))
dataset4<-get(paste0("Q4_",year))
#rename weight variable
reformat_dataset<-function(dataset){
colnames<-c(tail(sort(names(dataset[grep("PIWT", names(dataset))])),1),
"Graduate_Type",
"AgeGroup",
"SEX",
"ILODEFR",
"FTPTWK",
"GRSSWK")
dataset<- dataset[,colnames]
weight_var<-tail(sort(names(dataset[grep("PIWT", names(dataset))])),1)
weight_index<-match(colnames(dataset),weight_var,0)
colnames(dataset)[colnames(dataset) %in% weight_var] <- c("PIWT")
return(dataset)
}
dataset1<-reformat_dataset(dataset1)
dataset2<-reformat_dataset(dataset2)
dataset3<-reformat_dataset(dataset3)
dataset4<-reformat_dataset(dataset4)
dataset<-rbind(dataset1, dataset2, dataset3, dataset4)
#Restrict to just those in full time employment
dataset<-dataset[which(dataset$FTPTWK=="Full-time" & dataset$ILODEFR=="In employment"),]
#find the weight variable for the dataset
weight<-tail(sort(names(dataset[grep("PIWT", names(dataset))])),1)
#restrict to just those with an income weight. Those with 0 weight wouldn't be included in the weighted median anyway, but removing at this stage speeds up computation.
dataset<-dataset[which(dataset[,weight]!=0),]
#################################################
# Gross Weekly pay in main job- #
#################################################
##Split by Age, Gender and Graduate Type
all_age_sex<-Average_salary_by(dataset,c("AgeGroup","SEX", "Graduate_Type"))
##Split by Age and Graduate Type
all_age<-Average_salary_by(dataset,c("AgeGroup", "Graduate_Type"))
#add extra variable to show this is for any gender
all_age$SEX<-"Total"
##Split by Gender and Graduate Type
all_sex<-Average_salary_by(dataset,c("SEX", "Graduate_Type"))
#add extra variable to show this is for any age
all_sex$AgeGroup<-"Total"
#Just split by graduate type (i.e. across all agebands)
#Use the function just defined above
all_total<- Average_salary_by(dataset,"Graduate_Type")
#Add in an 'agegroup' for the average weekly salaries by graduate type and rename the column headers: this way we can merge the datasets as they'll have the same headers
all_total$AgeGroup<-"Total"
all_total$SEX<-"All"
#append the average salaries for people by graduate type and ageband with those for all agebands.
all_sal<-rbind(all_age, all_total, all_sex , all_age_sex)
all_sal$year<-year
#rename the young_grad_sal dataset to reference the year and quarter it's run on and output to the global environment.
assign(paste0("Sal_",year),all_sal,envir = globalenv())
#Also write to an Excel file for ease of use by economics team.
#write.csv(all_sal,paste0("Outputs for GLMS/SAL_",year,".csv"))
}
#############################################################################################
# Restrict to just graduates: Graduate Breakdown salaries #
# Most recent year only
#############################################################################################
Graduate_breakdown_salaries<-function(year){
#select dataset: for example if quarter =1 and year =2016 this will get Q1_2016 and set this to 'dataset'
dataset1<-get(paste0("Q1_",year))
dataset2<-get(paste0("Q2_",year))
dataset3<-get(paste0("Q3_",year))
dataset4<-get(paste0("Q4_",year))
dataset<-rbind(dataset1, dataset2, dataset3, dataset4)
#Restrict to just those in full time employment
dataset<-dataset[which(dataset$FTPTWK=="Full-time" & dataset$ILODEFR=="In employment"),]
#find the weight variable for the dataset
weight<-tail(sort(names(dataset[grep("PIWT", names(dataset))])),1)
#restrict to just those with an income weight. Those with 0 weight wouldn't be included in the weighted median anyway, but removing at this stage speeds up computation.
dataset<-dataset[which(dataset[,weight]!=0),]
#restrict to just graduates
dataset<-dataset[which(dataset$Graduate_Type=="Graduate"),]
#Average salary by ageband
age<-Average_salary_by(dataset,"AgeGroup")
age<-dplyr::rename(age, Value=AgeGroup)
#Repeat for average salary by gender
sex<-Average_salary_by(dataset,"SEX")
sex<-dplyr::rename(sex, Value=SEX)
#Average salary by ethnicity
ethnicity<-Average_salary_by(dataset,"ETHNICITY")
ethnicity<-dplyr::rename(ethnicity, Value=ETHNICITY)
#Average salary by disability status
disability<-Average_salary_by(dataset,"DISEA")
disability<-dplyr::rename(disability, Value=DISEA)
#Average salary by degree classification
degreeclass<-Average_salary_by(dataset,"DEGCLS7")
degreeclass<-dplyr::rename(degreeclass, Value=DEGCLS7)
#Average salary by STEM/non-STEM graduates
stem<-Average_salary_by(dataset,"STEM")
stem<-dplyr::rename(stem, Value=STEM)
stem$Value<-as.character(stem$Value)
#Average salary by LEM/non-LEM graduates
lem<-Average_salary_by(dataset,"LEM")
lem<-dplyr::rename(lem, Value=LEM)
lem$Value<-as.character(lem$Value)
#Average salary by OSSAH/non-OSSAH graduates
ossah<-Average_salary_by(dataset,"OSSAH")
ossah<-dplyr::rename(ossah, Value=OSSAH)
ossah$Value<-as.character(ossah$Value)
#Average salary by occupational group
occupation<-Average_salary_by(dataset,"Occupation")
occupation<-dplyr::rename(occupation, Value=Occupation)
#Average salary by industry category
industry<-Average_salary_by(dataset,"INDE07M")
industry<-dplyr::rename(industry, Value=INDE07M)
#Average salary by region
region<-Average_salary_by(dataset,"GOVTOF2")
region<-dplyr::rename(region, Value=GOVTOF2)
#Average salary by Sex and industry category
industry_sex<-Average_salary_by(dataset,c("SEX","INDE07M"))
industry_sex<-dplyr::rename(industry_sex, Value=SEX,Value1=INDE07M)
#append datasets to each other to create one dataset of graduate salaries for different demographic characteristics
grad_sal<-bind_rows(list(age,
sex,
ethnicity,
disability,
degreeclass,
stem,
lem,
ossah,
occupation,
industry,
region,
industry_sex))
#rename the grad_sal dataset to reference the year and quarter it's run on and output to the global environment.
assign(paste0("grad_sal_",year),grad_sal,envir = globalenv())
#Also write to an Excel file for ease of use by economics team.
write.csv(grad_sal,paste0("Outputs for GLMS/GRAD_SAL_",year,".csv"))
#########################################################
# Repeat but just for young graduates(21-30) #
#########################################################
dataset<-dataset[which(dataset$Graduate_Type=="Graduate" & dataset$AgeGroup=="21-30"),]
#Average salary by ageband
age<-Average_salary_by(dataset,"AgeGroup")
age<-dplyr::rename(age, Value=AgeGroup)
#Repeat for average salary by gender
sex<-Average_salary_by(dataset,"SEX")
sex<-dplyr::rename(sex, Value=SEX)
#Average salary by ethnicity
ethnicity<-Average_salary_by(dataset,"ETHNICITY")
ethnicity<-dplyr::rename(ethnicity, Value=ETHNICITY)
#Average salary by disability status
disability<-Average_salary_by(dataset,"DISEA")
disability<-dplyr::rename(disability, Value=DISEA)
#Average salary by degree classification
degreeclass<-Average_salary_by(dataset,"DEGCLS7")
degreeclass<-dplyr::rename(degreeclass, Value=DEGCLS7)
#Average salary by STEM/non-STEM graduates
stem<-Average_salary_by(dataset,"STEM")
stem<-dplyr::rename(stem, Value=STEM)
stem$Value<-as.character(stem$Value)
#Average salary by LEM/non-LEM graduates
lem<-Average_salary_by(dataset,"LEM")
lem<-dplyr::rename(lem, Value=LEM)
lem$Value<-as.character(lem$Value)
#Average salary by OSSAH/non-OSSAH graduates
ossah<-Average_salary_by(dataset,"OSSAH")
ossah<-dplyr::rename(ossah, Value=OSSAH)
ossah$Value<-as.character(ossah$Value)
#Average salary by occupational group
occupation<-Average_salary_by(dataset,"Occupation")
occupation<-dplyr::rename(occupation, Value=Occupation)
#Average salary by industry category
industry<-Average_salary_by(dataset,"INDE07M")
industry<-dplyr::rename(industry, Value=INDE07M)
#Average salary by region
region<-Average_salary_by(dataset,"GOVTOF2")
region<-dplyr::rename(region, Value=GOVTOF2)
#Average salary by Sex and industry category
industry_sex<-Average_salary_by(dataset,c("SEX","INDE07M"))
industry_sex<-dplyr::rename(industry_sex, Value=SEX,Value1=INDE07M)
#append datasets to each other to create one dataset of graduate salaries for different demographic characteristics
young_grad_sal<-bind_rows(list(age,
sex,
ethnicity,
disability,
degreeclass,
stem,
lem,
ossah,
occupation,
industry,
region,
industry_sex))
#rename the young_grad_sal dataset to reference the year and quarter it's run on and output to the global environment.
assign(paste0("young_grad_sal_",year),young_grad_sal,envir = globalenv())
# write to an Excel file
write.csv(young_grad_sal,paste0("Outputs for GLMS/YOUNG_GRAD_SAL_",year,".csv"))
}
#######################################
#And confidence intervals
#######################################
# See link below for a discussion of accounting for survey design and weighting
# https://www.ons.gov.uk/methodology/methodologicalpublications/generalmethodology/onsworkingpaperseries/onsmethodologyworkingpaperseriesno9guidetocalculatingstandarderrorsforonssocialsurveys#accounting-for-the-estimation-method
#options(survey.lonely.psu="fail") #default option
options(survey.lonely.psu="adjust") #lonley PSUs make no contribution to the variance.
confidence_intervals<-function(quarter,year,young){
dataset_name<-paste0("Q",quarter,"_",year)
dataset<-get(dataset_name)
#find the weight variable
weight<-tail(sort(names(dataset[grep("PWT", names(dataset))])),1)
dataset<-dataset[which(dataset[,weight]>=0),]
#subest to 21-30s if young=TRUE
if(young==TRUE){dataset<-dataset[which(dataset$AgeGroup=="21-30"),]}
dataset$hhld_id<-substring(dataset$CASENO,1,13)
#define graduates, post graduates and non_graduates
dataset$Grad<-0
dataset$Grad[dataset$Graduate_Type=="Graduate"]<-1
dataset$PostGrad<-0
dataset$PostGrad[dataset$Graduate_Type=="PostGrad"]<-1
dataset$NonGrad<-0
dataset$NonGrad[dataset$Graduate_Type=="Non Grad"]<-1
#define employed graduates, post graduate and non graduates
dataset$Emp_Grad<-0
dataset$Emp_Grad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="Graduate"]<-1
dataset$Emp_PostGrad<-0
dataset$Emp_PostGrad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="PostGrad"]<-1
dataset$Emp_NonGrad<-0
dataset$Emp_NonGrad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="Non Grad"]<-1
#define graduates, post graduates and non graduates in high skilled employment
dataset$HSEmp_Grad<-0
dataset$HSEmp_Grad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="Graduate"&
dataset$SOCHE=="High Skilled Employment"]<-1
dataset$HSEmp_PostGrad<-0
dataset$HSEmp_PostGrad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="PostGrad"&
dataset$SOCHE=="High Skilled Employment"]<-1
dataset$HSEmp_NonGrad<-0
dataset$HSEmp_NonGrad[dataset$ILODEFR=="In employment" & dataset$Graduate_Type=="Non Grad"&
dataset$SOCHE=="High Skilled Employment"]<-1
#define Unemployed graduates, post graduate and non graduates
dataset$Unemp_Grad<-0
dataset$Unemp_Grad[dataset$ILODEFR=="ILO unemployed" & dataset$Graduate_Type=="Graduate"]<-1
dataset$Unemp_PostGrad<-0
dataset$Unemp_PostGrad[dataset$ILODEFR=="ILO unemployed" & dataset$Graduate_Type=="PostGrad"]<-1
dataset$Unemp_NonGrad<-0
dataset$Unemp_NonGrad[dataset$ILODEFR=="ILO unemployed" & dataset$Graduate_Type=="Non Grad"]<-1
#define economically active graduates, post graduate and non graduates
dataset$Act_Grad<-0
dataset$Act_Grad[(dataset$ILODEFR=="ILO unemployed"| dataset$ILODEFR=="In employment")
& dataset$Graduate_Type=="Graduate"]<-1
dataset$Act_PostGrad<-0
dataset$Act_PostGrad[(dataset$ILODEFR=="ILO unemployed"| dataset$ILODEFR=="In employment")
& dataset$Graduate_Type=="PostGrad"]<-1
dataset$Act_NonGrad<-0
dataset$Act_NonGrad[(dataset$ILODEFR=="ILO unemployed"| dataset$ILODEFR=="In employment")
& dataset$Graduate_Type=="Non Grad"]<-1
#dataset<<-dataset
#Define the survey design:
#clusters- households
#strata- low level area identifier (Unitary Local Authoritary)
#weight- person weight
weight_formula<-as.formula(paste0("~",weight))
surveydesign<-svydesign(id=hhld_id, data=dataset, strata=UALA, weight=weight_formula,nest=TRUE)
#surveydesign<<-surveydesign
#work out the standard error for graduates
#Note the programe saves the variance = SE^2
ratio_and_confidence_int<-function(Numerator_input, Demoninator_input){
Numerator<-as.formula(paste0("~",Numerator_input))
Denominator<-as.formula(paste0("~",Demoninator_input))
#calcualte mean and standard error
ratio_output<-svyratio(numerator=Numerator,denominator=Denominator, design = surveydesign)
#calcualte confidence intervals
confidence_int<-confint(ratio_output, level = 0.95,df =degf(surveydesign))
#reformat to dataframe
ratio_output<-data.frame(matrix(unlist(ratio_output),nrow=1, byrow=T),stringsAsFactors = FALSE)
confidence_int<-data.frame(matrix(unlist(confidence_int),nrow=1, byrow=T),stringsAsFactors = FALSE)
#We sqare root to get back to the standard error
ratio_output$X2<-sqrt(as.numeric(as.character(ratio_output$X2)))
#summarise as 1 dataframe
output<-data.frame(cbind(ratio_output$X1, confidence_int, ratio_output$X2),stringsAsFactors = FALSE)
numerator_type<-strsplit(Numerator_input,"_")[[1]][1]
names(output)<-c(paste0(numerator_type,"_prop"),paste0(numerator_type,"_prop_lower"),
paste0(numerator_type,"_prop_upper"),paste0(numerator_type,"_SE"))
output$population<-strsplit(Numerator_input,"_")[[1]][2]
output$Quarter<-dataset_name
return(output)
}
#graduates
grad_emp<-ratio_and_confidence_int("Emp_Grad","Grad")
grad_HSemp<-ratio_and_confidence_int("HSEmp_Grad","Grad")
grad_Unemp<-ratio_and_confidence_int("Unemp_Grad","Act_Grad")
list<-list(grad_emp, grad_HSemp, grad_Unemp)
grad<-Reduce(function(x, y) merge(x, y, all.x=T,
by=c("population","Quarter")), list, accumulate=F)
#postgraduates
PostGrad_emp<-ratio_and_confidence_int("Emp_PostGrad","PostGrad")
PostGrad_HSemp<-ratio_and_confidence_int("HSEmp_PostGrad","PostGrad")
PostGrad_Unemp<-ratio_and_confidence_int("Unemp_PostGrad","Act_PostGrad")
list<-list(PostGrad_emp, PostGrad_HSemp, PostGrad_Unemp)
PostGrad<-Reduce(function(x, y) merge(x, y, all.x=T,
by=c("population","Quarter")), list, accumulate=F)
#Non graduates
NonGrad_emp<-ratio_and_confidence_int("Emp_NonGrad","NonGrad")
NonGrad_HSemp<-ratio_and_confidence_int("HSEmp_NonGrad","NonGrad")
NonGrad_Unemp<-ratio_and_confidence_int("Unemp_NonGrad","Act_NonGrad")
list<-list(NonGrad_emp, NonGrad_HSemp, NonGrad_Unemp)
NonGrad<-Reduce(function(x, y) merge(x, y, all.x=T,
by=c("population","Quarter")), list, accumulate=F)
ratios<-rbind(grad,PostGrad,NonGrad)
#rename to keep in sync with Graduate_Types
ratios$population[ratios$population=="Grad"]<-"Graduate"
ratios$population[ratios$population=="NonGrad"]<-"Non Grad"
#Also want the sample and population sizes
dataset %>%
group_by(ILODEFR,Graduate_Type)%>%
summarise(count=n(),weighted_pop=sum(!!sym(weight))) %>%
gather(variable, value, -(Graduate_Type:ILODEFR)) %>%
unite(temp, ILODEFR, variable) %>%
spread(temp, value)->samples
dataset[which(dataset$SOCHE=="High Skilled Employment" & dataset$ILODEFR=="In employment"),] %>%
group_by(Graduate_Type)%>%
summarise(HS_emp_count=n(),HS_emp_weighted_pop=sum(!!sym(weight))) ->sample2
samples<-merge(x=samples, y=sample2, by=("Graduate_Type"))
colnames(samples)[colnames(samples)=="Graduate_Type"]<-"population"
ratios<-merge(x=ratios, y=samples, by=c("population"), all.x=TRUE)
return(ratios)
}
#Find confidence intervals from start year to end year
output<-lapply(start_year:end_year, function(y){lapply(1:4,function(x)confidence_intervals(x,y,FALSE))})
#store as dataframe
output2<-Reduce(rbind,lapply(1:length(output),function(x)Reduce(rbind, output[[x]])))
output2$year<-substring(output2$Quarter,4,7)
output2$Quarter<-substring(output2$Quarter,2,2)
output2<-output2[c("population","year",names(output2)[2:22])]
#sort by population and quarter
output3<-output2[with(output2, order(population, year, Quarter)), ]
output3$age<-"16-64"
#repeat for young_graduates
#Find confidence intervals from start year to end year
output<-lapply(start_year:end_year, function(y){lapply(1:4,function(x)confidence_intervals(x,y,TRUE))})
#store as dataframe
output2<-Reduce(rbind,lapply(1:length(output),function(x)Reduce(rbind, output[[x]])))
output2$year<-substring(output2$Quarter,4,7)
output2$Quarter<-substring(output2$Quarter,2,2)
output2<-output2[c("population","year",names(output2)[2:22])]
#sort by population and quarter
output4<-output2[with(output2, order(population, year, Quarter)), ]
output4$age<-"21-30"
output5<-rbind(output3,output4)
write.csv(output5,paste0("Outputs for GLMS/Emp_confidence_Intervals.csv"))
#######################################################
# 1. Import data #
#######################################################
start_year=2006 #Which year do you want to use LFS data from?
end_year=2018 #Which year do you want the LFS data up to?
#### OPTIONAL STEP###
#Convert all LFS data from Q1 of the start year to Q4 of the end year into R data format
#sapply(start_year:end_year, function(y)sapply(1:4,function(x)readLFS(x,y)))
#####################
#After Import/If you have the .rds files already run
sapply(start_year:end_year, function(y)sapply(1:4, function(x) read_LFS_from_project(x,y)))
#####################
#Older versions of HIQUAL15 don't have value labels,just numbers but we can use the value labels
#of Q1_2016 to recode the numbers as text strings
library(data.table)
HIQUAL15<-seq(1:85)
HIQUAL15_values<-levels(Q1_2016$HIQUAL15)
HIQUAL15_ref<-tibble(HIQUAL15_values,HIQUAL15)
Q1_2015<-left_join(Q1_2015,HIQUAL15_ref,by="HIQUAL15")
Q1_2015<-Q1_2015%>%select(-HIQUAL15)
setnames(Q1_2015,"HIQUAL15_values","HIQUAL15")
#####################
#Recode the LFS variables to those used in the publication
sapply(start_year:end_year, function(y)sapply(1:4, function(x) recode_variables(x,y)))
#Output employment rates
#Note be careful with the weights for this function.
#This outputs the proprotion in employment for each quarter. In Excel this needs to be combined to give the mean proportion in employment for the year.
#This reduces seasonality in the data
sapply(start_year:end_year, function(y)sapply(1:4, function(x) Employment_rate(x,y)))
#####################
#Create headline stats for each year (mean average of rates in each quarter)
sapply(start_year:end_year,function(x)year_average_employment_rate(x))
list<-lapply(start_year:end_year,function(x)get(paste0("Headline_",x)))
Yearly_employment<-Reduce(rbind,list)
write.csv(Yearly_employment,"Outputs for GLMS/Headline_employment_rates.csv")
#Create the graduate breakdown for the latest year
Graduate_breakdown_year(end_year)
#####################
#Output weekly salaries
#This uses the data for the whole year to output median salaries
sapply(start_year:end_year, function(y)Average_Salaries(y))
list<-lapply(start_year:end_year,function(x)get(paste0("Sal_",x)))
Yearly_salaries<-Reduce(rbind,list)
write.csv(Yearly_salaries,"Outputs for GLMS/Yearly_salaries.csv")
#And salaries for the graduate breakdowns
Graduate_breakdown_salaries(end_year)
#####################