# install.packages("rJava")
# install.packages("openNLPmodels.en", repos <- "http://datacube.wu.ac.at", lib = "D:/angel/Documents/R/win-library/3.6")
# install.packages("NLP")
# install.packages("openNLP")
# install.packages("tm")
# install.packages("dplyr")
# install.packages("stringr")
# Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jdk1.8.0_231\\jre') # for 64-bit version
library(rJava)
# Start the JVM with a 4 GB heap. The option must be passed by name:
# `parameters <- "..."` is an assignment whose value is matched positionally
# (to `classpath`), so the heap size was never applied.
.jinit(parameters = "-Xmx4g")
library(NLP)
library(openNLP)
library(openNLPmodels.en)
library(tm)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Builds the annotations for one document: sentence and word tokens first,
## then POS tags on top of them, then the parse annotator on top of those.
## Returns the result of the last annotation pass.
get_annotations_from_document <- function(doc) {
  text <- as.String(doc)
  tokenizers <- list(
    Maxent_Sent_Token_Annotator(),
    Maxent_Word_Token_Annotator()
  )
  pos_tagger <- Maxent_POS_Tag_Annotator()
  gc()
  token_annotations <- NLP::annotate(text, tokenizers)
  pos_annotations <- NLP::annotate(text, pos_tagger, token_annotations)
  parser <- Parse_Annotator()
  NLP::annotate(text, parser, pos_annotations)
}
## Renders a document as a single string of "token/POS" pairs separated by
## spaces, using the "word" annotations and their POS feature.
get_annotated_merged_document <- function(doc, annotations) {
  text <- as.String(doc)
  word_annotations <- subset(annotations, type == "word")
  pos_tags <- sapply(word_annotations$features, `[[`, "POS")
  tagged_tokens <- sprintf("%s/%s", text[word_annotations], pos_tags)
  paste(tagged_tokens, collapse = " ")
}
## Wraps a document and its annotations in an AnnotatedPlainTextDocument.
get_annotated_plain_text_document <- function(doc, annotations) {
  AnnotatedPlainTextDocument(as.String(doc), annotations)
}
## Returns every match of `pattern` in a document as one ", "-separated string.
## For each match the capture groups are joined with a space; a pattern
## without capture groups yields the whole match. Returns NA when nothing
## matches.
detect_pattern_on_document <- function(doc, pattern) {
  text <- as.String(doc)
  hits <- str_match_all(text, pattern)[[1]]
  n_hits <- dim(hits)[1]
  n_cols <- dim(hits)[2]

  # No rows means no matches were found
  if (n_hits == 0) {
    return(NA)
  }

  # Only the full-match column exists: the pattern has no capture groups
  if (n_cols == 1) {
    return(paste(hits[, 1], collapse = ', '))
  }

  if (n_cols > 2) {
    # Several capture groups: glue the groups of each match together
    per_match <- apply(hits[, 2:n_cols, drop = FALSE], 1, paste, collapse = ' ')
  } else {
    per_match <- hits[, 2]
  }

  # All matches found, separated by ','
  paste(per_match, collapse = ', ')
}
## Returns the first match of `pattern` in a document together with some
## surrounding context.
##
## Arguments:
##   doc      document (anything as.String() accepts)
##   pattern  regular expression to locate
##   number   how many characters of context to keep on each side of the
##            match (default 50, the value that used to be hard-coded)
##
## Returns the matched text plus context, or NA when the pattern does not
## occur. Context is cut off at the start and end of the document.
detect_pattern_on_document_with_context <- function(doc, pattern, number = 50) {
  txt <- as.String(doc)
  coord <- str_locate(txt, pattern)
  # str_locate() reports NA positions when there is no match
  if (is.na(coord[1])) {
    return(NA_character_)
  }
  substr(txt, coord[1] - number, coord[2] + number)
}
## Builds a data frame with one row per document of `corpus`: a "File" column
## holding the document id, followed by one column per pattern holding the
## matches detect_pattern_on_document() found for it (NA when none).
detect_patterns_in_corpus <- function(corpus, patterns) {
  n_docs <- length(corpus)
  n_patterns <- length(patterns)
  vall_entities <- data.frame(matrix(NA, nrow = n_docs, ncol = n_patterns + 1))
  names(vall_entities) <- c("File", patterns)

  # One column per pattern
  for (k in seq_along(patterns)) {
    hits <- lapply(corpus, detect_pattern_on_document, pattern = patterns[k])
    vall_entities[, k + 1] <- unlist(hits)
  }

  # Document ids go into the first column
  for (d in seq_along(corpus)) {
    vall_entities$File[d] <- meta(corpus[[d]])$id
  }

  vall_entities
}
## Same layout as the untagged variant, but the patterns are searched in
## `taggedCorpus` while the file ids are still read from `corpus`.
## The two inputs are expected to have one entry per document, in the same
## order.
detect_patterns_in_tagged_corpus <- function(corpus, taggedCorpus, patterns) {
  column_names <- c("File", patterns)
  vall_entities <- data.frame(
    matrix(NA, nrow = length(corpus), ncol = length(column_names))
  )
  names(vall_entities) <- column_names

  # Search each pattern in the tagged text of every document
  for (k in seq_along(patterns)) {
    found <- lapply(taggedCorpus, detect_pattern_on_document,
                    pattern = patterns[k])
    vall_entities[, k + 1] <- unlist(found)
  }

  # File ids come from the original (untagged) corpus
  for (d in seq_along(corpus)) {
    vall_entities$File[d] <- meta(corpus[[d]])$id
  }

  vall_entities
}
## Counts, for each pattern column, the number of rows (files) that have a
## non-NA value.
##
## Arguments:
##   df  data frame whose first column identifies the file and whose
##       remaining columns hold the matches of one pattern each
##
## Returns a data frame with columns "Entity" (the pattern column name) and
## "Count". A data frame without pattern columns yields zero rows.
count_matches_per_column <- function(df) {
  # seq_len()[-1] is empty for a single-column frame, unlike 2:length(...)
  pattern_cols <- seq_len(ncol(df))[-1]
  counts <- vapply(
    pattern_cols,
    function(j) sum(!is.na(df[[j]])),
    integer(1)
  )
  data.frame(
    Entity = names(df)[pattern_cols],
    Count = counts,
    stringsAsFactors = FALSE
  )
}
## Counts, for each file, the number of pattern columns that have a non-NA
## value, and keeps only the files with at least one match.
##
## Arguments:
##   df  data frame with a "File" column first, followed by one column per
##       pattern
##
## Returns a data frame with columns "File" and "Count"; rows whose count is
## zero are dropped.
count_matches_per_row <- function(df) {
  pattern_values <- df[, -1, drop = FALSE]
  counts <- as.integer(rowSums(!is.na(pattern_values)))
  entity_count_per_file <- data.frame(
    File = df$File,
    Count = counts,
    stringsAsFactors = FALSE
  )
  # The original filter referenced an undefined name (entityCountPerFile)
  entity_count_per_file[entity_count_per_file$Count != 0, ]
}
## Prints, for every pattern, a header line followed by the non-NA entries of
## its column in `matches` (pattern k is read from column k + 1) and a blank
## separator.
print_matches_per_pattern <- function(patterns, matches) {
  for (idx in seq_along(patterns)) {
    column <- matches[, idx + 1]
    found <- column[!is.na(unlist(column))]
    print(paste("PATTERN: ", patterns[idx]))
    print(found)
    print(" ")
  }
}
## Collapses the per-pattern match columns into one list of unique matches
## per file.
##
## Arguments:
##   df  data frame whose first column identifies the file and whose
##       remaining columns hold ", "-separated matches (or NA)
##
## Returns a data frame with columns "Files" (first column of `df`) and
## "Matches" (a list column: one list of trimmed, de-duplicated match
## strings per file, empty when the file has no matches).
merge_all_matches_in_lists <- function(df) {
  # Empty for a single-column frame, unlike 2:ncol(df)
  pattern_cols <- seq_len(ncol(df))[-1]

  # Splits one cell into its individual matches; NA contributes nothing
  split_cell <- function(cell) {
    if (is.na(cell)) {
      return(character(0))
    }
    if (!grepl(",", cell, fixed = TRUE)) {
      return(cell)
    }
    strsplit(cell, split = ",", fixed = TRUE)[[1]]
  }

  matches_per_file <- lapply(seq_len(nrow(df)), function(i) {
    cells <- vapply(
      pattern_cols,
      function(j) as.character(df[[j]][i]),
      character(1)
    )
    parts <- unlist(lapply(cells, split_cell))
    if (length(parts) == 0) {
      return(list())
    }
    as.list(unique(str_trim(parts)))
  })

  all_matches <- data.frame(Files = df[, 1], stringsAsFactors = FALSE)
  all_matches$Matches <- matches_per_file
  all_matches
}
## Turns the gold standard table into the same two-column layout as the
## detected matches: "Files" (first column of `df`) and "Matches", a list
## column with the non-NA gold standard entries of each row (named after
## their source columns).
merge_gold_standard_in_lists <- function(df) {
  gold_cols <- 2:length(df)
  is_present <- function(cell) !is.na(cell)

  gold_per_file <- lapply(seq_len(nrow(df)), function(row) {
    present <- Filter(is_present, df[row, gold_cols])
    as.list(unlist(present))
  })

  all_matches <- data.frame(matrix(NA, nrow = nrow(df), ncol = 2))
  names(all_matches) <- c("Files", "Matches")
  all_matches$Files <- df[, 1]
  all_matches$Matches <- gold_per_file
  all_matches
}
## Calculates precision, recall and F-measure against a gold standard.
##
## Arguments:
##   matches     data frame whose second column is a list column with the
##               detected matches of each file
##   matches.gs  data frame with the gold standard in a "Matches" list
##               column (also its second column)
##   beta        weight of recall relative to precision in the F-measure;
##               the default 1 gives the usual F1
##
## Row i of `matches` is compared with row i of `matches.gs`. Files whose
## gold standard list is empty are skipped entirely.
##
## Returns a one-row data frame with "Precision", "Recall" and "Fmeasure".
## A metric whose denominator is zero is reported as 0 instead of NaN.
calculate_metrics <- function(matches, matches.gs, beta = 1) {
  num_correct <- 0
  all_answers <- 0
  possible_answers <- 0

  for (i in seq_len(nrow(matches))) {
    if (length(matches.gs$Matches[[i]]) == 0) {
      next
    }
    found <- unlist(matches[i, 2])
    found <- if (length(found) > 0) str_trim(found) else character(0)
    expected <- unname(unlist(matches.gs[i, 2]))

    num_correct <- num_correct + length(intersect(found, expected))
    all_answers <- all_answers + length(found)
    possible_answers <- possible_answers + length(expected)
  }

  # Guard the divisions: 0 / 0 is NaN and would make the `if` below fail
  precision <- if (all_answers > 0) num_correct / all_answers else 0
  recall <- if (possible_answers > 0) num_correct / possible_answers else 0

  if (precision == 0 && recall == 0) {
    f_measure <- 0
  } else {
    # F_beta = (1 + beta^2) * P * R / (beta^2 * P + R)
    beta_sq <- beta^2
    f_measure <- ((1 + beta_sq) * precision * recall) /
      (beta_sq * precision + recall)
  }

  data.frame(Precision = precision, Recall = recall, Fmeasure = f_measure)
}
## Test with a reduced corpus due to Java memory issues.
## Reads the documents of ../data/pos-reduced as UTF-8, annotates each one
## with get_annotations_from_document() and shows the first annotations of
## the first document.
corpus_source <- DirSource("../data/pos-reduced", encoding = "UTF-8")
corpus <- Corpus(corpus_source)
corpus_annotations <- lapply(corpus, get_annotations_from_document)
head(corpus_annotations[[1]])
## id type start end features
## 1 sentence 1 265 constituents=<<integer,54>>, parse=<<character,1>>
## 2 sentence 268 439 constituents=<<integer,36>>, parse=<<character,1>>
## 3 sentence 442 591 constituents=<<integer,27>>, parse=<<character,1>>
## 4 sentence 594 797 constituents=<<integer,44>>, parse=<<character,1>>
## 5 sentence 800 939 constituents=<<integer,28>>, parse=<<character,1>>
## 6 sentence 942 1299 constituents=<<integer,70>>, parse=<<character,1>>
## Last annotations of the first document
tail(corpus_annotations[[1]])
## id type start end features
## 844 word 4189 4197 POS=NN
## 845 word 4199 4199 POS=,
## 846 word 4201 4208 POS=NN
## 847 word 4210 4212 POS=CC
## 848 word 4214 4217 POS=NN
## 849 word 4219 4225 POS=NN
## Pair every document with its annotations (Map walks both in parallel) and
## show the first annotated document.
corpus_tagged <- Map(get_annotated_plain_text_document, corpus, corpus_annotations)
corpus_tagged[[1]]
## <<AnnotatedPlainTextDocument>>
## Metadata: 0
## Annotations: length: 849
## Content: chars: 4226
## Render every document as "token/POS" text; the patterns below are matched
## against this representation. Shows the first document.
corpus_tagged_text <- Map(get_annotated_merged_document, corpus, corpus_annotations)
corpus_tagged_text[[1]]
## [1] "films/NNS adapted/VBD from/IN comic/JJ books/NNS have/VBP had/VBN plenty/NN of/IN success/NN ,/, whether/IN they/PRP 're/VBP about/IN superheroes/NNS (/-LRB- batman/NN ,/, superman/NN ,/, spawn/NN )/-RRB- ,/, or/CC geared/VBN toward/IN kids/NNS (/-LRB- casper/NN )/-RRB- or/CC the/DT arthouse/NN crowd/NN (/-LRB- ghost/NN world/NN )/-RRB- ,/, but/CC there/EX 's/VBZ never/RB really/RB been/VBN a/DT comic/JJ book/NN like/IN from/IN hell/NN before/IN ./. for/IN starters/NNS ,/, it/PRP was/VBD created/VBN by/IN alan/NN moore/NN (/-LRB- and/CC eddie/JJ campbell/NN )/-RRB- ,/, who/WP brought/VBD the/DT medium/NN to/TO a/DT whole/JJ new/JJ level/NN in/IN the/DT mid/JJ '80s/NNS with/IN a/DT 12-part/JJ series/NN called/VBN the/DT watchmen/NNS ./. to/TO say/VB moore/NN and/CC campbell/NN thoroughly/RB researched/VBD the/DT subject/NN of/IN jack/NN the/DT ripper/NN would/MD be/VB like/IN saying/VBG michael/NN jackson/NN is/VBZ starting/VBG to/TO look/VB a/DT little/JJ odd/JJ ./. the/DT book/NN (/-LRB- or/CC \"/`` graphic/JJ novel/NN ,/, \"/`` if/IN you/PRP will/MD )/-RRB- is/VBZ over/IN 500/CD pages/NNS long/RB and/CC includes/VBZ nearly/RB 30/CD more/RBR that/IN consist/VB of/IN nothing/NN but/CC footnotes/NNS ./. in/IN other/JJ words/NNS ,/, do/VBP n't/RB dismiss/VB this/DT film/NN because/IN of/IN its/PRP$ source/NN ./. if/IN you/PRP can/MD get/VB past/IN the/DT whole/JJ comic/JJ book/NN thing/NN ,/, you/PRP might/MD find/VB another/DT stumbling/JJ block/NN in/IN from/IN hell/NN 's/POS directors/NNS ,/, albert/NN and/CC allen/JJ hughes/NNS ./. 
getting/VBG the/DT hughes/NNS brothers/NNS to/TO direct/VB this/DT seems/VBZ almost/RB as/RB ludicrous/JJ as/IN casting/VBG carrot/NN top/NN in/IN ,/, well/RB ,/, anything/NN ,/, but/CC riddle/VB me/PRP this/DT :/: who/WP better/RB to/TO direct/VB a/DT film/NN that/WDT 's/VBZ set/VBN in/IN the/DT ghetto/NN and/CC features/NNS really/RB violent/JJ street/NN crime/NN than/IN the/DT mad/JJ geniuses/NNS behind/IN menace/NN ii/NNS society/NN ?/. the/DT ghetto/NN in/IN question/NN is/VBZ ,/, of/IN course/NN ,/, whitechapel/NN in/IN 1888/CD london/. 's/POS east/JJ end/NN ./. it/PRP 's/VBZ a/DT filthy/JJ ,/, sooty/JJ place/NN where/WRB the/DT whores/NNS (/-LRB- called/VBN \"/`` unfortunates/JJ \"/'' )/-RRB- are/VBP starting/VBG to/TO get/VB a/DT little/JJ nervous/JJ about/IN this/DT mysterious/JJ psychopath/NN who/WP has/VBZ been/VBN carving/VBG through/IN their/PRP$ profession/NN with/IN surgical/JJ precision/NN ./. when/WRB the/DT first/JJ stiff/NN turns/VBZ up/RP ,/, copper/NN peter/NN godley/NN (/-LRB- robbie/NN coltrane/NN ,/, the/DT world/NN is/VBZ not/RB enough/JJ )/-RRB- calls/VBZ in/IN inspector/NN frederick/NN abberline/NN (/-LRB- johnny/JJ depp/NN ,/, blow/NN )/-RRB- to/TO crack/VB the/DT case/NN ./. abberline/NN ,/, a/DT widower/NN ,/, has/VBZ prophetic/JJ dreams/NNS he/PRP unsuccessfully/RB tries/VBZ to/TO quell/VB with/IN copious/JJ amounts/NNS of/IN absinthe/NNS and/CC opium/NN ./. upon/IN arriving/VBG in/IN whitechapel/NN ,/, he/PRP befriends/VBZ an/DT unfortunate/NN named/VBN mary/JJ kelly/NN (/-LRB- heather/NN graham/NN ,/, say/VBP it/PRP is/VBZ n't/RB so/RB )/-RRB- and/CC proceeds/NNS to/TO investigate/VB the/DT horribly/RB gruesome/JJ crimes/NNS that/IN even/RB the/DT police/NN surgeon/NN ca/MD n't/RB stomach/VB ./. 
i/PRP do/VBP n't/RB think/VB anyone/NN needs/NNS to/TO be/VB briefed/VBN on/IN jack/NN the/DT ripper/NN ,/, so/IN i/PRP wo/MD n't/RB go/VB into/IN the/DT particulars/NNS here/RB ,/, other/JJ than/IN to/TO say/VB moore/NN and/CC campbell/NN have/VBP a/DT unique/JJ and/CC interesting/JJ theory/NN about/IN both/DT the/DT identity/NN of/IN the/DT killer/NN and/CC the/DT reasons/NNS he/PRP chooses/VBZ to/TO slay/VB ./. in/IN the/DT comic/JJ ,/, they/PRP do/VBP n't/RB bother/VB cloaking/VBG the/DT identity/NN of/IN the/DT ripper/NN ,/, but/CC screenwriters/NNS terry/NN hayes/NNS (/-LRB- vertical/JJ limit/NN )/-RRB- and/CC rafael/JJ yglesias/NNS (/-LRB- les/NNS mis/NN ?/. rables/NNS )/-RRB- do/VBP a/DT good/JJ job/NN of/IN keeping/VBG him/PRP hidden/VBN from/IN viewers/NNS until/IN the/DT very/JJ end/NN ./. it/PRP 's/VBZ funny/JJ to/TO watch/VB the/DT locals/NNS blindly/RB point/VBP the/DT finger/NN of/IN blame/NN at/IN jews/NNS and/CC indians/NNS because/IN ,/, after/IN all/DT ,/, an/DT englishman/NN could/MD never/RB be/VB capable/JJ of/IN committing/VBG such/JJ ghastly/JJ acts/NNS ./. and/CC from/IN hell/NN 's/POS ending/NN had/VBD me/PRP whistling/VBG the/DT stonecutters/NNS song/NN from/IN the/DT simpsons/NNS for/IN days/NNS (/-LRB- \"/'' who/WP holds/VBZ back/RB the/DT electric/JJ car/who/NN made/VBD steve/JJ guttenberg/NN a/DT star/NN ?/. \"/`` )/-RRB- ./. do/VBP n't/RB worry/VB -/: it/PRP 'll/MD all/DT make/VB sense/NN when/WRB you/PRP see/VBP it/PRP ./. now/RB onto/IN from/IN hell/NN 's/POS appearance/NN :/: it/PRP 's/VBZ certainly/RB dark/JJ and/CC bleak/JJ enough/JJ ,/, and/CC it/PRP 's/VBZ surprising/JJ to/TO see/VB how/WRB much/RB more/JJR it/PRP looks/VBZ like/IN a/DT tim/JJ burton/NN film/NN than/IN planet/NN of/IN the/DT apes/NNS did/VBD (/-LRB- at/IN times/NNS ,/, it/PRP seems/VBZ like/IN sleepy/JJ hollow/JJ 2/CD )/-RRB- ./. 
the/DT print/NN i/NN saw/VBD was/VBD n't/RB completely/RB finished/VBN (/-LRB- both/DT color/NN and/CC music/NN had/VBD not/RB been/VBN finalized/VBN ,/, so/IN no/DT comments/NNS about/IN marilyn/JJ manson/NN )/-RRB- ,/, but/CC cinematographer/NN peter/NN deming/NN (/-LRB- do/VBP n't/RB say/VB a/DT word/NN )/-RRB- ably/RB captures/VBZ the/DT dreariness/NN of/IN victorian-era/NN london/RB and/CC helped/VBD make/VB the/DT flashy/JJ killing/NN scenes/NNS remind/VBD me/PRP of/IN the/DT crazy/JJ flashbacks/NNS in/IN twin/JJ peaks/NNS ,/, even/RB though/IN the/DT violence/NN in/IN the/DT film/NN pales/NNS in/IN comparison/NN to/TO that/DT in/IN the/DT black-and-white/JJ comic/JJ ./. oscar/NN winner/NN martin/VBG childs/NNS '/POS (/-LRB- shakespeare/NN in/IN love/NN )/-RRB- production/NN design/NN turns/VBZ the/DT original/JJ prague/NN surroundings/NNS into/IN one/CD creepy/JJ place/NN ./. even/RB the/DT acting/VBG in/IN from/IN hell/NN is/VBZ solid/JJ ,/, with/IN the/DT dreamy/JJ depp/NN turning/VBG in/IN a/DT typically/RB strong/JJ performance/NN and/CC deftly/RB handling/VBG a/DT british/JJ accent/NN ./. ians/NNS holm/VBP (/-LRB- joe/NN gould/NN 's/POS secret/NN )/-RRB- and/CC richardson/NN (/-LRB- 102/CD dalmatians/NNS )/-RRB- log/VBP in/IN great/JJ supporting/VBG roles/NNS ,/, but/CC the/DT big/JJ surprise/NN here/RB is/VBZ graham/NN ./. i/NN cringed/VBD the/DT first/JJ time/NN she/PRP opened/VBD her/PRP$ mouth/NN ,/, imagining/VBG her/PRP$ attempt/NN at/IN an/DT irish/JJ accent/NN ,/, but/CC it/PRP actually/RB was/VBD n't/RB half/DT bad/JJ ./. the/DT film/NN ,/, however/RB ,/, is/VBZ all/DT good/JJ ./. 2/CD :/: 00/CD -/: r/NN for/IN strong/JJ violence/gore/NN ,/, sexuality/NN ,/, language/NN and/CC drug/NN content/NN"
## Patterns over the "token/POS" text. Capture groups mark the name parts
## that are extracted.
## The letter class is [A-Za-z]: the former [A-z] also matched the ASCII
## characters between "Z" and "a" ([ \ ] ^ _ `), which let tokens such as
## "_election" through. Output recorded before this change (pattern strings
## used as column names, matches, metrics) must be regenerated by re-running
## the script.
patterns <- c(
  "created/VBN by/IN ([A-Za-z]*)/NN ([A-Za-z]*)/NN",
  "created/VBN by/IN [A-Za-z]*/NN [A-Za-z]*/NN \\(/-LRB- and/CC ([A-Za-z]*)/JJ ([A-Za-z]*)/NN",
  "screenwriter[s]?/NN[S]? ([A-Za-z]*)/(?:NN[S]?|JJ) ([A-Za-z]*)/(?:NN[S]?|JJ)",
  "cinematographer/NN(?: ,/,)? ([A-Za-z]*)/(?:NN[S]?|JJ) ([A-Za-z]*)/NN[S]?",
  "cinematographer/NN(?: ,/,)? ([A-Za-z]*)/NN ([A-Za-z]*)/IN ([A-Za-z]*)/NN",
  "oscar/NN winner/NN ([A-Za-z]*)/VBG ([A-Za-z]*)/NNS",
  # catch second screen writer
  "screenwriters/NNS [A-Za-z]*/(?:NN[S]?|JJ) [A-Za-z]*/(?:NN[S]?|JJ) \\(/-LRB- [A-Za-z /]* \\)/-RRB- and/CC ([A-Za-z]*)/(?:NN[S]?|JJ) ([A-Za-z]*)/(?:NN|JJ)",
  # name surname is verb-ing ...
  "([A-Za-z]*)/NN ([A-Za-z]*)/NN is/VBZ [A-Za-z]*/VBG",
  # name surname and name
  "([A-Za-z]*)/NN ([A-Za-z]*)/NN and/CC [A-Za-z]*/NN[S]?",
  # name and name surname
  "([A-Za-z]*)/NN and/CC [A-Za-z]*/JJ [A-Za-z]*/NN[S]?",
  # a name surname film
  "a/DT ([A-Za-z]*)/(?:NN[S]?|JJ) ([A-Za-z]*)/(?:NN[S]?|JJ) film/NN",
  # to say (name) and name
  "to/TO say/VB ([A-Za-z]*)/NN and/CC [A-Za-z]*/NN",
  # to say name and (name)
  "to/TO say/VB [A-Za-z]*/NN and/CC ([A-Za-z]*)/NN",
  # named name surname
  "named/VBN ([A-Za-z]*)/(?:NN[S]?|JJ) ([A-Za-z]*)/(?:NN[S]?|JJ)",
  # about name surname
  "about/IN ([A-Za-z]*)/(?:NN[S]?|JJ) ([A-Za-z]*)/(?:NN[S]?|JJ)",
  # made name surname a star
  "made/VBD ([A-Za-z]*)/(?:NN[S]?|JJ) ([A-Za-z]*)/(?:NN[S]?|JJ) a/DT star/NN",
  # name surname (name surname)
  "([A-Za-z]*)/(?:NN[S]?|JJ) ([A-Za-z]*)/(?:NN[S]?|JJ|VBP) \\(/-LRB- [A-Za-z]*/(?:NN[S]?|JJ) [A-Za-z]*/(?:NN[S]?|JJ)",
  # (name surname)
  "\\(/-LRB- ([A-Za-z]*)/(?:NN[S]?|JJ) ([A-Za-z]*)/(?:NN[S]?|JJ)"
)
# Disabled patterns, kept for reference:
# "[A-Za-z]*/NN and/CC ([A-Za-z]*)/NN"  # name and name
# "([A-Za-z]*)/NN ([A-Za-z]*)/NN is|was/VBZ|VBD [A-Za-z]*/VBG"  # name surname is|was verb-ing
## Search every pattern in the tagged text of each document (file ids are
## taken from `corpus`) and print the matches pattern by pattern.
found_entities <- detect_patterns_in_tagged_corpus(corpus, corpus_tagged_text, patterns)
print_matches_per_pattern(patterns, found_entities)
## [1] "PATTERN: created/VBN by/IN ([A-z]*)/NN ([A-z]*)/NN"
## [1] "alan moore"
## [1] " "
## [1] "PATTERN: created/VBN by/IN [A-z]*/NN [A-z]*/NN \\(/-LRB- and/CC ([A-z]*)/JJ ([A-z]*)/NN"
## [1] "eddie campbell"
## [1] " "
## [1] "PATTERN: screenwriter[s]?/NN[S]? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)"
## [1] "terry hayes"
## [1] " "
## [1] "PATTERN: cinematographer/NN(?: ,/,)? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/NN[S]?"
## [1] "peter deming"
## [1] " "
## [1] "PATTERN: cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/IN ([A-z]*)/NN"
## logical(0)
## [1] " "
## [1] "PATTERN: oscar/NN winner/NN ([A-z]*)/VBG ([A-z]*)/NNS"
## [1] "martin childs"
## [1] " "
## [1] "PATTERN: screenwriters/NNS [A-z]*/(?:NN[S]?|JJ) [A-z]*/(?:NN[S]?|JJ) \\(/-LRB- [A-z /]* \\)/-RRB- and/CC ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN|JJ)"
## [1] "rafael yglesias"
## [1] " "
## [1] "PATTERN: ([A-z]*)/NN ([A-z]*)/NN is/VBZ [A-z]*/VBG"
## [1] "michael jackson"
## [1] " "
## [1] "PATTERN: ([A-z]*)/NN ([A-z]*)/NN and/CC [A-z]*/NN[S]?"
## [1] "matthew broderick" "book chain" "bruce lee"
## [1] " "
## [1] "PATTERN: ([A-z]*)/NN and/CC [A-z]*/JJ [A-z]*/NN[S]?"
## [1] "albert"
## [1] " "
## [1] "PATTERN: a/DT ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ) film/NN"
## [1] "tim burton"
## [1] " "
## [1] "PATTERN: to/TO say/VB ([A-z]*)/NN and/CC [A-z]*/NN"
## [1] "moore, moore"
## [1] " "
## [1] "PATTERN: to/TO say/VB [A-z]*/NN and/CC ([A-z]*)/NN"
## [1] "campbell, campbell"
## [1] " "
## [1] "PATTERN: named/VBN ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)"
## [1] "mary kelly"
## [1] " "
## [1] "PATTERN: about/IN ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)"
## [1] "marilyn manson"
## [1] " "
## [1] "PATTERN: made/VBD ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ) a/DT star/NN"
## [1] "steve guttenberg"
## [1] " "
## [1] "PATTERN: ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ|VBP) \\(/-LRB- [A-z]*/(?:NN[S]?|JJ) [A-z]*/(?:NN[S]?|JJ)"
## [1] "arthouse crowd, peter godley, frederick abberline, mary kelly, terry hayes, rafael yglesias, ians holm"
## [2] "tracy flick, tracy flick, max fischer, _election _, _rushmore _, individual screenplay"
## [3] "martin brody, larry vaughn, matt hooper"
## [4] "moise tschombe"
## [1] " "
## [1] "PATTERN: \\(/-LRB- ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)"
## [1] "ghost world, robbie coltrane, johnny depp, heather graham, vertical limit, les mis, joe gould"
## [2] "reese witherspoon, matthew broderick, _election _, _rushmore _, matthew broderick, bill murray, _rushmore _, _election _"
## [3] "roy scheider, murray hamilton, richard dreyfuss, robert shaw"
## [4] "eriq ebouaney, maka kotto, pascal nzonzi"
## [1] " "
## Export the matches as a ";"-separated file (NA written as empty fields),
## then merge the per-pattern columns into one list of matches per file.
write.table(found_entities, file = "results/found_entities.csv",
            row.names = FALSE, na = "", sep = ";")
matches_list <- merge_all_matches_in_lists(found_entities)
head(found_entities)
## File created/VBN by/IN ([A-z]*)/NN ([A-z]*)/NN
## 1 cv000_29590.txt alan moore
## 2 cv001_18431.txt <NA>
## 3 cv002_15918.txt <NA>
## 4 cv003_11664.txt <NA>
## 5 cv004_11636.txt <NA>
## 6 cv005_29443.txt <NA>
## created/VBN by/IN [A-z]*/NN [A-z]*/NN \\(/-LRB- and/CC ([A-z]*)/JJ ([A-z]*)/NN
## 1 eddie campbell
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## screenwriter[s]?/NN[S]? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)
## 1 terry hayes
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## cinematographer/NN(?: ,/,)? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/NN[S]?
## 1 peter deming
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/IN ([A-z]*)/NN
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## oscar/NN winner/NN ([A-z]*)/VBG ([A-z]*)/NNS
## 1 martin childs
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## screenwriters/NNS [A-z]*/(?:NN[S]?|JJ) [A-z]*/(?:NN[S]?|JJ) \\(/-LRB- [A-z /]* \\)/-RRB- and/CC ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN|JJ)
## 1 rafael yglesias
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## ([A-z]*)/NN ([A-z]*)/NN is/VBZ [A-z]*/VBG
## 1 michael jackson
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## ([A-z]*)/NN ([A-z]*)/NN and/CC [A-z]*/NN[S]?
## 1 <NA>
## 2 matthew broderick
## 3 book chain
## 4 <NA>
## 5 bruce lee
## 6 <NA>
## ([A-z]*)/NN and/CC [A-z]*/JJ [A-z]*/NN[S]?
## 1 albert
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## a/DT ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ) film/NN
## 1 tim burton
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## to/TO say/VB ([A-z]*)/NN and/CC [A-z]*/NN
## 1 moore, moore
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## to/TO say/VB [A-z]*/NN and/CC ([A-z]*)/NN
## 1 campbell, campbell
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## named/VBN ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)
## 1 mary kelly
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## about/IN ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)
## 1 marilyn manson
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## made/VBD ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ) a/DT star/NN
## 1 steve guttenberg
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ|VBP) \\(/-LRB- [A-z]*/(?:NN[S]?|JJ) [A-z]*/(?:NN[S]?|JJ)
## 1 arthouse crowd, peter godley, frederick abberline, mary kelly, terry hayes, rafael yglesias, ians holm
## 2 tracy flick, tracy flick, max fischer, _election _, _rushmore _, individual screenplay
## 3 <NA>
## 4 martin brody, larry vaughn, matt hooper
## 5 <NA>
## 6 moise tschombe
## \\(/-LRB- ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)
## 1 ghost world, robbie coltrane, johnny depp, heather graham, vertical limit, les mis, joe gould
## 2 reese witherspoon, matthew broderick, _election _, _rushmore _, matthew broderick, bill murray, _rushmore _, _election _
## 3 <NA>
## 4 roy scheider, murray hamilton, richard dreyfuss, robert shaw
## 5 <NA>
## 6 eriq ebouaney, maka kotto, pascal nzonzi
## Load the gold standard: a ";"-separated file read without quoting, with
## every column as character and empty fields as NA. It is then reshaped to
## one list of gold standard matches per file.
gold_standard <- read.table(file = "../data/goldStandard.csv", quote = "", na.strings = "",
colClasses = "character", sep = ";")
gold_standard_matches_list <- merge_gold_standard_in_lists(gold_standard)
head(gold_standard_matches_list)
## Files
## 1 cv000_29590.txt
## 2 cv001_18431.txt
## 3 cv002_15918.txt
## 4 cv003_11664.txt
## 5 cv004_11636.txt
## 6 cv005_29443.txt
## Matches
## 1 alan moore, eddie campbell, moore, campbell, jack, michael jackson, albert, allen hughes, peter godley, robbie coltrane, frederick abberline, johnny depp, abberline, mary kelly, heather graham, terry hayes, rafael yglesias, steve guttenberg, tim burton, marilyn manson, peter deming, martin childs, depp, ians holm, joe gould, richardson, graham
## 2 matthew broderick, reese witherspoon, george washington carver, tracy flick, paul, max fischer, bill murray, broderick, witherspoon, jessica campbell, tammy, rooney, campbell, alexander payne, tracy, m
## 3 ryan, hanks, tom hanks, joe fox, meg ryan, kathleen kelley, fox, kelley
## 4 john williams, steven spielberg, spielberg, williams, martin brody, roy scheider, larry vaughn, murray hamilton, brody, matt hooper, richard dreyfuss, hooper, vaughn, quint, robert shaw, hitchcock, scheider, dreyfuss, shaw, robert redford, paul newman, duddy kravitz, ahab
## 5 herb, jackie chan, barry sanders, sanders, jackie, chan, bruce lee, tim allen, lawrence kazdan, john williams, spielberg, george lucas
## 6 raoul peck, lumumba, patrice lumumba, eriq ebouaney, helmer peck, peck, pascal bonitzer, patrice, joseph kasa vubu, maka kotto, moise tschombe, pascal nzonzi
## MY MATCHES: everything the patterns detected in the second file
print(unlist(matches_list$Matches[[2]]))
## [1] "matthew broderick" "tracy flick" "max fischer"
## [4] "_election _" "_rushmore _" "individual screenplay"
## [7] "reese witherspoon" "bill murray"
## GOLD STANDARD MATCHES: the expected entities for the same (second) file
print(unlist(gold_standard_matches_list$Matches[[2]]))
## V2 V3
## "matthew broderick" "reese witherspoon"
## V4 V5
## "george washington carver" "tracy flick"
## V6 V7
## "paul" "max fischer"
## V8 V9
## "bill murray" "broderick"
## V10 V11
## "witherspoon" "jessica campbell"
## V12 V13
## "tammy" "rooney"
## V14 V15
## "campbell" "alexander payne"
## V16 V17
## "tracy" "m"
## Compare the detected matches with the gold standard, file by file
metrics <- calculate_metrics(matches_list, gold_standard_matches_list)
## Show metrics
metrics
## Precision Recall Fmeasure
## 1 0.826087 0.3877551 0.5277778
In this exercise I tried to obtain a higher recall by sacrificing a bit of precision, obtaining a recall of around 0.4 and a precision of around 0.8 on the files I tested the patterns on. The resulting F-measure of about 0.53 indicates a good overall performance.
To do so, I inspected randomly chosen files to study where names usually appear. I found that one of the most common places is inside parentheses, where the text explains a reference. Hence, that is where I found the most significant patterns for increasing the recall.
I would have liked to have a more powerful PC to compute the metrics over the whole corpus, but I could only use a reduced dataset, as my computer ran into Java memory issues otherwise.