I have 60 sample of data from various studies. All I get is a mean for x and y, and a sample size. Furthermore y is a percentage. Sometimes we get the standard error of the mean.
IIRC, percentages have strange distributions.
What is the correct way to handle the percentage and what is the correct way to deal with the weights?
I am using r's lm
function. Is this the proper tool?
related: What are the issues with using percentage outcome in linear regression? https://www.theanalysisfactor.com/proportions-as-dependent-variable-in-regression-which-type-of-model/ (almost all of the % data is between .2 and .8)
Trying different things, I get:
## [1] "linear regression Multiple R-squared: 0.729671\n"
## [1] "weighted linear regression Multiple R-squared: 0.775034\n"
## [1] "weighted linear regression Multiple R-squared with arcsine sqrt %: 0.749212\n"
## [1] "weighted linear regression Multiple R-squared with arcsine %: 0.772651\n"
Edit. Does it make any sense to not weight the data?
I am trying betareg but getting: In betareg.fit(X, Y, Z, weights, offset, link, link.phi, type, control) : no valid starting value for precision parameter found, using 1 instead
Does anyone know what this means or what I should do?
Edit 2: See the r markdown and code below.
Edit 4: This produces:
1 "unweighted"
1 "model: linear, r.squared: 0.727507"
1 "model: beta logit, pseudo.r.squared: 0.728726"
1 "model: beta loglog, pseudo.r.squared: 0.723229"
1 "weighted"
1 "model: linear, r.squared: 0.717221"
1 "model: beta logit, pseudo.r.squared: 0.728726"
1 "model: beta loglog, pseudo.r.squared: 0.723229"
1 "arcsinesqrt"
1 "model: linear, r.squared: 0.725157"
1 "arcsine"
1 "model: linear, r.squared: 0.701171"
It seems strange that the weighted and unweighted beta correlations are the same. Is this reasonable?
Edit 5: Seems like it's a bug.
R markdown (y.Rmd)
---
title: "Example of betareg usage"
author: "Ray Tayek"
date: "September 5, 2018"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
source("y.R")
```
#Overview
Trying to figure out how to do regression with percentages
```{r echo=T,results="hide",fig.keep="none"}
map<-run()
```
#Summary
## Correlations for the different cases
```{r}
printSummaries(map)
```
---
## regression lines with synthetic data **(ignore, it's a test)**
```{r, fig.width=6, fig.height=4 }
models<-map[["synthetic"]]
plot(df$x,df$y,ylim=c(0.,1.),cex=100*df$size)
myLines(models,df$x)
printSummary(models)
```
---
## FBG% vs FBG with real data (unweighted)
```{r, fig.width=6, fig.height=4 }
plot(t1$x,t1$y,ylim=c(0.,1.),cex=100*t1$size)
models<-map[["unweighted"]]
myLines(models,t1$x)
printSummary(models)
```
---
## FBG% vs FBG with real data (weighted)
```{r, fig.width=6, fig.height=4 }
plot(t1$x,t1$y,ylim=c(0.,1.),cex=100*t1$size)
models<-map[["weighted"]]
myLines(models,t1$x)
printSummary(models)
```
---
## FBG% vs FBG with arcsine sqrt transformation
```{r, fig.width=6, fig.height=4 }
t1$y2<-asin(sqrt(t1$y)) # doesn't stick!
plot(t1$x,t1$y2,ylim=c(0.,1.),cex=100*t1$size)
models<-map[["arcsinesqrt"]]
myLines(models,t1$x)
printSummary(models)
```
---
## FBG% vs FBG with arcsine transformation
```{r, fig.width=6, fig.height=4 }
t1$y3<-asin(t1$y)
plot(t1$x,t1$y3,ylim=c(0.,1.),cex=100*t1$size)
models<-map[["arcsine"]]
myLines(models,t1$x)
printSummary(models)
```
---
# Apendix
## Environment
```{r}
sessionInfo()
```
## R code
### y.R
```{r, code=readLines("y.R")}
```
R code: (y.R)
myColumNames=c("a","x","y","n")
modelLabels<-c("linear","beta logit","beta loglog")
colors<-c("black","red","green")
lineTypes=2:4
map<-list() # global variable!
t1<-NULL # global variable!
df<-NULL # global variable!
writeCopy<-function(t) {
write.table(t1,file='copy.txt')
}
readCopy<-function() {
df<-read.table("copy.txt")
print("using copy of real data")
return(df)
}
getExcelFile<-function(file) {
print("reading excel file.")
library(readxl)
t0<-read_excel(file)
start<-1
stop<-60
print(sprintf("start: %d: %f, stop: %d %f",start,t0$FBG[1],stop,t0$FBG[stop]))
t1<-t0[start:stop,5:8]
rm(t0)
colnames(t1)<-myColumNames
return(t1)
}
getData<-function(file) {
t1<-NULL
#t1<-getyExcelFile("D:/ray/dev/john2/dpa1c/A1c goal vs FBG Fig 1 aug 24 edits and A TO Z.xls")
#writeCopy(t1)
#if(file.exists("copy.txt")) t1<-(readCopy())
print("reading inline data.")
t1<-read.table(textConnection(input),header=TRUE)
t1$size<-t1$n/sum(t1$n)
return(t1)
}
checkData<-function(t1) {
print(sprintf("na's in a: %s",nas(t1$a)))
print(sprintf("na's in x: %s",nas(t1$x)))
print(sprintf("na's in y: %s",nas(t1$y)))
print(sprintf("na's in n %s",nas(t1$n)))
print(summary(t1$y))
print(sprintf("min percent at: %d, max percent at: %d",which.min(t1$y),which.max(t1$y)))
print(sprintf("min percent: %f, max percent: %f",t1$y[which.min(t1$y)],t1$y[which.max(t1$y)]))
}
input<-("
'a' 'x' 'y' 'n'
'1' 7.5 144 0.42 265
'2' 6.8 102.6 0.68 250
'3' 6.74 106 0.64 122
'4' 6.97 110 0.55 122
'5' 7.3 122 0.49 296
'6' 7.08 120.2 0.431 439
'7' 7.05 113.6 0.421 439
'8' 7.2 119 0.53 188
'9' 7.8 150.66 0.164 455
'10' 8.08 164.7 0.084 459
'11' 7.82 131.8 0.363 785
'12' 7 124 0.49 211
'13' 6.95 113.4 0.52 229
'14' 7.05 106.2 0.56 228
'15' 7.1 127.8 0.503 352
'16' 7.2 129.6 0.443 352
'17' 7.8 158.4 0.16 164
'18' 7.3 108 0.389 240
'19' 7.25 115 0.439 230
'20' 7.13 109.6 0.53 229
'21' 7.45 111.6 0.38 228
'22' 7.8 142 0.14 57
'23' 7.15 111.7 0.41 225
'24' 8 153.2 0.28 222
'25' 6.96 117 0.58 367
'26' 6.97 120 0.573 389
'27' 7.3 124.2 0.39 223
'28' 8.1 144 0.12 166
'29' 7.25 127.8 0.4 404
'30' 7.28 128.7 0.409 403
'31' 7.16 128.5 0.51 291
'32' 7.12 126 0.52 291
'33' 7.06 107.3 0.48 360
'34' 6.98 107 0.51 360
'35' 6.89 113.4 0.534 63
'36' 6.8 124.2 0.539 58
'37' 7.15 120 0.48 229
'38' 7 106.2 0.52 773
'39' 7.15 113 0.58 234
'40' 7.1 115.2 0.54 257
'41' 7.1 113.4 0.514 555
'42' 7 108 0.542 278
'43' 7.13 120 0.522 159
'44' 6.6 104 0.725 307
'45' 7.2 120 0.428 535
'46' 6.9 115 0.576 1003
'47' 7.1 119 0.447 213
'48' 6.8 113 0.663 428
'49' 7.17 110 0.44 196
'50' 6.92 104 0.57 192
'51' 7.8 160.2 0.27 146
'52' 7.8 158.4 0.27 125
'53' 7.9 145.8 0.243 1298
'54' 7.5 158 0.355 172
'55' NA 144.7 0.389 10
'56' 8.9 197 0.03 34
'57' 7.8 185 0.2187 32
'58' 8 165 0.1786 28
'59' 7.9 183 0.137 29
'60' 8 180 0.2 30
")
nas<-function(x) {
v<-as.list(which(is.na(x)))
s<-do.call(sprintf,c(v,list(fmt=paste(rep("#%d ",length(v)/1),collapse=" "))))
return(s)
}
printOneSummary<-function(label,model) {
if(is(model,"lm")) print(sprintf("model: %s, r.squared: %f",label,summary(model)$r.squared))
else print(sprintf("model: %s, pseudo.r.squared: %f",label,summary(model)$pseudo.r.squared))
}
printSummary<-function(models) {
for(i in 1:length(models))
printOneSummary(modelLabels[i],models[[i]])
}
printOneLongSummary<-function(label,model) {
print(sprintf("model: %s",Label))
print(summary(model))
}
printLongSummary<-function(models) {
for(i in 1:length(models))
printOneLongSummary(modelLabels[i],models[[i]])
}
# try this: new_DF <- DF[is.na(DF$Var),]
makeModel<-function(x,y,weights,weighted){
m<-NULL
if(weighted) m<-lm(y~x,na.action=na.omit,weights=weights)
else m<-lm(y~x,na.action=na.omit)
return(list(m))
}
myLines<-function(models,x) {
for(i in 1:length(models)) {
if(is(models[[i]],"lm"))
abline(models[[i]],col=colors[i],lty=lineTypes[i])
else lines(x,predict(models[[i]]),col=colors[i],lty=lineTypes[i])
}
if(length(models)>1)
legend(160,.9,legend=modelLabels,col=colors,lty=lineTypes,cex=0.8)
}
f1<-function(x,y,weights,weighted) { #unwrap these?
models<-makeModel(x,y,weights,weighted)
plot(x,y,ylim=c(0.,1.),cex=100*weights)
myLines(models,x)
return(models)
}
makeModels<-function(x,y,weights,weighted){
models <-NULL
if(weighted) {
m<-lm(y~x,na.action=na.omit,weights=weights)
logit<-betareg(y~x,na.action=na.omit,weights=weights)
loglog <- betareg(y~x,na.action=na.omit,weights=weights,link = "loglog")
models<-list(m,logit,loglog)
} else {
m<-lm(y~x,na.action=na.omit)
logit<-betareg(y~x,na.action=na.omit)
loglog<-betareg(y~x,na.action=na.omit,link="loglog")
models<-list(m,logit,loglog)
}
}
f<-function(x,y,weights,weighted) {
models<-makeModels(x,y,weights,weighted)
plot(x,y,ylim=c(0.,1.),cex=100*weights) # leave in for debugging in console
myLines(models,x)
return(models)
}
getSyntheticData<-function() {
x=seq(100,200,length.out=60)
y=1-(.1+.8*sqrt((x-100)/100))
df<-data.frame(x,y)
df$n<-1
df$size<-df$n/sum(df$n)
return(df)
}
getRealData<-function() {
return(getData(T))
}
initialize<-function() {
if(!require(betareg)){install.packages("betareg")}
set.seed(123)
}
printSummaries<-function(map) {
for (name in names(map)) {
print(name)
printSummary(map[[name]])
}
}
run<-function() {
initialize()
df<<-getSyntheticData()
checkData(df)
summary(df$y)
print("synthetic")
models<-f(df$x,df$y,df$size,F)
print(length(models))
map<-c(map,synthetic=list(models))
#if(T) return(map)
t1<<-getRealData()
checkData(t1)
print("unweighted")
models<-f(t1$x,t1$y,t1$size,F)
print(length(models))
map<-c(map,unweighted=list(models))
print("weighted")
models<-f(t1$x,t1$y,t1$size,T)
print(length(models))
map<-c(map,weighted=list(models))
t1$y2<-asin(sqrt(t1$y))
print("arcsinesqrt")
models<-f1(t1$x,t1$y2,t1$size,T)
print(length(models))
map<-c(map,arcsinesqrt=list(models))
t1$y3<-asin(t1$y)
print("arcsine")
models<-f1(t1$x,t1$y3,t1$size,T)
print(length(models))
map<-c(map,arcsine=list(models))
printSummaries(map)
print("after printSummaries")
return(map)
}