Skip to content

Commit

Permalink
Switched over to file.path()
Browse files Browse the repository at this point in the history
  • Loading branch information
johnmyleswhite committed Feb 17, 2012
1 parent 261991d commit b8e9690
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 37 deletions.
8 changes: 4 additions & 4 deletions 01-Introduction/ufo_sightings.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ library('ggplot2') # We'll use ggplot2 for all of our visualizations
# We also have to alter two defaults; first, we want the strings to not be converted to
# factor types; and, this data does not have header labels in the first row, so
# we want to keep the first row as data.
ufo <- read.delim("data/ufo/ufo_awesome.tsv",
ufo <- read.delim(file.path("data", "ufo", "ufo_awesome.tsv"),
sep = "\t",
stringsAsFactors = FALSE,
header = FALSE,
Expand Down Expand Up @@ -125,7 +125,7 @@ quick.hist <- ggplot(ufo.us, aes(x = DateOccurred)) +
geom_histogram() +
scale_x_date(major = "50 years")
ggsave(plot = quick.hist,
filename = 'images/quick_hist.pdf',
filename = file.path("images", "quick_hist.pdf"),
height = 6,
width = 8)

Expand All @@ -138,7 +138,7 @@ new.hist <- ggplot(ufo.us, aes(x = DateOccurred)) +
geom_histogram() +
scale_x_date(major = "50 years")
ggsave(plot = quick.hist,
filename = "images/new_hist.pdf",
filename = file.path("images", "new_hist.pdf"),
height = 6,
width = 8)

Expand Down Expand Up @@ -208,6 +208,6 @@ state.plot <- ggplot(all.sightings, aes(x = YearMonth,y = Sightings)) +
opts(title = "Number of UFO sightings by Month-Year and U.S. State (1990-2010)")
# Save the plot as a PDF
ggsave(plot = state.plot,
filename = "images/ufo_sightings.pdf",
filename = file.path("images", "ufo_sightings.pdf"),
width = 14,
height = 8.5)
48 changes: 23 additions & 25 deletions 03-Classification/email_classify.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ library('tm')
library('ggplot2')

# Set the global paths
spam.path <- "data/spam/"
spam2.path <- "data/spam_2/"
easyham.path <- "data/easy_ham/"
easyham2.path <- "data/easy_ham_2/"
hardham.path <- "data/hard_ham/"
hardham2.path <- "data/hard_ham_2/"
spam.path <- file.path("data", "spam")
spam2.path <- file.path("data", "spam_2")
easyham.path <- file.path("data", "easy_ham")
easyham2.path <- file.path("data", "easy_ham_2")
hardham.path <- file.path("data", "hard_ham")
hardham2.path <- file.path("data", "hard_ham_2")

# Create motivating plot
x <- runif(1000, 0, 40)
Expand All @@ -50,7 +50,7 @@ ex1 <- ggplot(val, aes(x, V2)) +
xlab("X") +
ylab("Y")
ggsave(plot = ex1,
filename = "images/00_Ex1.pdf",
filename = file.path("images", "00_Ex1.pdf"),
height = 10,
width = 10)

Expand Down Expand Up @@ -137,7 +137,7 @@ classify.email <- function(path, training.df, prior = 0.5, c = 1e-6)
spam.docs <- dir(spam.path)
spam.docs <- spam.docs[which(spam.docs != "cmds")]
all.spam <- sapply(spam.docs,
function(p) get.msg(paste(spam.path, p, sep = "")))
function(p) get.msg(file.path(spam.path, p)))

# Create a DocumentTermMatrix from that vector
spam.tdm <- get.tdm(all.spam)
Expand Down Expand Up @@ -166,7 +166,7 @@ spam.df <- transform(spam.df,
easyham.docs <- dir(easyham.path)
easyham.docs <- easyham.docs[which(easyham.docs != "cmds")]
all.easyham <- sapply(easyham.docs[1:length(spam.docs)],
function(p) get.msg(paste(easyham.path, p, sep = "")))
function(p) get.msg(file.path(easyham.path, p)))

easyham.tdm <- get.tdm(all.easyham)

Expand All @@ -193,12 +193,10 @@ hardham.docs <- dir(hardham.path)
hardham.docs <- hardham.docs[which(hardham.docs != "cmds")]

hardham.spamtest <- sapply(hardham.docs,
function(p) classify.email(paste(hardham.path, p, sep = ""),
training.df = spam.df))
function(p) classify.email(file.path(hardham.path, p), training.df = spam.df))

hardham.hamtest <- sapply(hardham.docs,
function(p) classify.email(paste(hardham.path, p, sep = ""),
training.df = easyham.df))
function(p) classify.email(file.path(hardham.path, p), training.df = easyham.df))

hardham.res <- ifelse(hardham.spamtest > hardham.hamtest,
TRUE,
Expand All @@ -207,15 +205,15 @@ summary(hardham.res)

# Find counts of just terms 'html' and 'table' in all SPAM and EASYHAM docs, and create figure
html.spam <- sapply(spam.docs,
function(p) count.word(paste(spam.path, p, sep = ""), "html"))
function(p) count.word(file.path(spam.path, p), "html"))
table.spam <- sapply(spam.docs,
function(p) count.word(paste(spam.path, p, sep = ""), "table"))
function(p) count.word(file.path(spam.path, p), "table"))
spam.init <- cbind(html.spam, table.spam, "SPAM")

html.easyham <- sapply(easyham.docs,
function(p) count.word(paste(easyham.path, p, sep = ""), "html"))
function(p) count.word(file.path(easyham.path, p), "html"))
table.easyham <- sapply(easyham.docs,
function(p) count.word(paste(easyham.path, p, sep = ""), "table"))
function(p) count.word(file.path(easyham.path, p), "table"))
easyham.init <- cbind(html.easyham, table.easyham, "EASYHAM")

init.df <- data.frame(rbind(spam.init, easyham.init),
Expand All @@ -233,7 +231,7 @@ init.plot1 <- ggplot(init.df, aes(x = html, y = table)) +
stat_abline(yintersept = 0, slope = 1) +
theme_bw()
ggsave(plot = init.plot1,
filename = "images/01_init_plot1.pdf",
filename = file.path("images", "01_init_plot1.pdf"),
width = 10,
height = 10)

Expand All @@ -245,7 +243,7 @@ init.plot2 <- ggplot(init.df, aes(x = html, y = table)) +
stat_abline(yintersept = 0, slope = 1) +
theme_bw()
ggsave(plot = init.plot2,
filename = "images/02_init_plot2.pdf",
filename = file.path("images", "02_init_plot2.pdf"),
width = 10,
height = 10)

Expand All @@ -272,17 +270,17 @@ spam2.docs <- spam2.docs[which(spam2.docs != "cmds")]
easyham2.class <- suppressWarnings(lapply(easyham2.docs,
function(p)
{
spam.classifier(paste(easyham2.path, p, sep = ""))
spam.classifier(file.path(easyham2.path, p))
}))
hardham2.class <- suppressWarnings(lapply(hardham2.docs,
function(p)
{
spam.classifier(paste(hardham2.path, p, sep = ""))
spam.classifier(file.path(hardham2.path, p))
}))
spam2.class <- suppressWarnings(lapply(spam2.docs,
function(p)
{
spam.classifier(paste(spam2.path,p,sep = ""))
spam.classifier(file.path(spam2.path, p))
}))

# Create a single, final, data frame with all of the classification data in it
Expand Down Expand Up @@ -319,7 +317,7 @@ class.plot <- ggplot(class.df, aes(x = Pr.HAM, Pr.SPAM)) +
theme_bw() +
opts(axis.text.x = theme_blank(), axis.text.y = theme_blank())
ggsave(plot = class.plot,
filename = "images/03_final_classification.pdf",
filename = file.path("images", "03_final_classification.pdf"),
height = 10,
width = 10)

Expand All @@ -340,5 +338,5 @@ colnames(class.res) <- c("NOT SPAM", "SPAM")
print(class.res)

# Save the training data for use in Chapter 4
write.csv(spam.df, "data/spam_df.csv", row.names = FALSE)
write.csv(easyham.df, "data/easyham_df.csv", row.names = FALSE)
write.csv(spam.df, file.path("data", "spam_df.csv"), row.names = FALSE)
write.csv(easyham.df, file.path("data", "easyham_df.csv"), row.names = FALSE)
16 changes: 8 additions & 8 deletions 04-Ranking/priority_inbox.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ library('tm')
library('ggplot2')

# Set the global paths
data.path <- "../03-Classification/data/"
easyham.path <- paste(data.path, "easy_ham/", sep = "")
data.path <- file.path("..", "03-Classification", "data")
easyham.path <- file.path(data.path, "easy_ham")

# We define a set of functions that will extract the data
# for the feature set we have defined to rank email
Expand Down Expand Up @@ -106,7 +106,7 @@ parse.email <- function(path)
easyham.docs <- dir(easyham.path)
easyham.docs <- easyham.docs[which(easyham.docs != "cmds")]
easyham.parse <- lapply(easyham.docs,
function(p) parse.email(paste(easyham.path, p, sep = "")))
function(p) parse.email(file.path(easyham.path, p)))

# Convert raw data from list to data frame
ehparse.matrix <- do.call(rbind, easyham.parse)
Expand Down Expand Up @@ -168,7 +168,7 @@ from.scales <- ggplot(from.ex) +
theme_bw() +
opts(axis.text.y = theme_text(size = 5, hjust = 1))
ggsave(plot = from.scales,
filename = "images/0011_from_scales.pdf",
filename = file.path("images", "0011_from_scales.pdf"),
height = 4.8,
width = 7)

Expand All @@ -190,7 +190,7 @@ from.rescaled <- ggplot(from.weight, aes(x = 1:nrow(from.weight))) +
theme_bw() +
opts(axis.text.y = theme_blank(), axis.text.x = theme_blank())
ggsave(plot = from.rescaled,
filename = "images/0012_from_rescaled.pdf",
filename = file.path("images", "0012_from_rescaled.pdf"),
height = 4.8,
width = 7)

Expand Down Expand Up @@ -426,7 +426,7 @@ threshold.plot <- ggplot(train.ranks.df, aes(x = Rank)) +
scale_fill_manual(values = c("darkred" = "darkred"), legend = FALSE) +
theme_bw()
ggsave(plot = threshold.plot,
filename = "images/01_threshold_plot.pdf",
filename = file.path("images", "01_threshold_plot.pdf"),
height = 4.7,
width = 7)

Expand All @@ -448,7 +448,7 @@ final.df$Date <- date.converter(final.df$Date, pattern1, pattern2)
final.df <- final.df[rev(with(final.df, order(Date))), ]

# Save final data set and plot results.
write.csv(final.df, "data/final_df.csv", row.names = FALSE)
write.csv(final.df, file.path("data", "final_df.csv"), row.names = FALSE)

testing.plot <- ggplot(subset(final.df, Type == "TRAINING"), aes(x = Rank)) +
stat_density(aes(fill = Type, alpha = 0.65)) +
Expand All @@ -459,6 +459,6 @@ testing.plot <- ggplot(subset(final.df, Type == "TRAINING"), aes(x = Rank)) +
scale_fill_manual(values = c("TRAINING" = "darkred", "TESTING" = "darkblue")) +
theme_bw()
ggsave(plot = testing.plot,
filename = "images/02_testing_plot.pdf",
filename = file.path("images", "02_testing_plot.pdf"),
height = 4.7,
width = 7)

0 comments on commit b8e9690

Please sign in to comment.