Intelligent Systems Course

Author: Angel Igareta angel@igareta.com

Packages —-

# install.packages("rJava")
# install.packages("openNLPmodels.en", repos <- "http://datacube.wu.ac.at", lib = "D:/angel/Documents/R/win-library/3.6")
# install.packages("NLP")
# install.packages("openNLP")
# install.packages("tm")
# install.packages("dplyr")
# install.packages("stringr")
# Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jdk1.8.0_231\\jre') # for 64-bit version

library(rJava)
# Start the JVM with a 4 GB heap. The option has to be passed by name: written
# as `parameters <- "-Xmx4g"` the string was matched positionally to the
# `classpath` argument, so the heap setting was never applied.
.jinit(parameters = "-Xmx4g")
library(NLP)
library(openNLP)
library(openNLPmodels.en)
library(tm)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

Utils. Source: Raúl García-Castro R-Pub —-

## Annotates a document with sentence, word, POS-tag and parse annotations.
##   doc: document whose content is converted with as.String().
## Returns the result of the last NLP::annotate() call, which was given the
## sentence/word and POS annotations as its starting point.
get_annotations_from_document <- function(doc) {
  text <- as.String(doc)

  # Create the token and POS annotators, then run a garbage collection before
  # the annotation passes start.
  sentence_annotator <- Maxent_Sent_Token_Annotator()
  word_annotator <- Maxent_Word_Token_Annotator()
  pos_annotator <- Maxent_POS_Tag_Annotator()
  gc()

  tokens <- NLP::annotate(text, list(sentence_annotator, word_annotator))
  tagged <- NLP::annotate(text, pos_annotator, tokens)

  # The parser is created only after tagging has finished; its annotations are
  # added on top of the tagged ones.
  parser <- Parse_Annotator()
  parsed <- NLP::annotate(text, parser, tagged)
  parsed
}

## Returns the document as one string of "token/POS" pairs separated by single
## spaces.
##   doc:         document whose content is converted with as.String().
##   annotations: annotations of `doc`; only those of type "word" are used and
##                their "POS" feature is read.
get_annotated_merged_document <- function(doc, annotations) {
  text <- as.String(doc)
  word_annotations <- subset(annotations, type == "word")
  pos_tags <- sapply(word_annotations$features, `[[`, "POS")
  tagged_tokens <- sprintf("%s/%s", text[word_annotations], pos_tags)
  paste(tagged_tokens, collapse = " ")
}

## Wraps the text of a document and its annotations in an
## AnnotatedPlainTextDocument.
##   doc:         document whose content is converted with as.String().
##   annotations: annotations to attach to the text.
get_annotated_plain_text_document <- function(doc, annotations) {
  AnnotatedPlainTextDocument(as.String(doc), annotations)
}

## Returns all matches of a pattern in a document as a single string.
##   doc:     document whose content is converted with as.String().
##   pattern: regular expression handed to str_match_all().
## Returns NA when nothing matches. Otherwise the matches are joined with
## ', ': the full match when the pattern has no capture group, or the capture
## groups of each match joined with a space.
detect_pattern_on_document <- function(doc, pattern) {
  hits <- str_match_all(as.String(doc), pattern)[[1]]
  n_hits <- nrow(hits)
  n_cols <- ncol(hits)

  # An empty match matrix means the pattern does not occur in the document.
  if (n_hits == 0) {
    return(NA)
  }

  # Only the full-match column exists: the pattern has no capture groups.
  if (n_cols == 1) {
    return(paste(hits[, 1], collapse = ', '))
  }

  # Several capture groups: merge the groups of every match into column 2.
  if (n_cols > 2) {
    hits[, 2] <- apply(hits[, 2:n_cols, drop = FALSE], 1, paste, collapse = ' ')
  }
  paste(hits[, 2], collapse = ', ')
}

## Returns the first match of a pattern in a document together with some
## surrounding text.
##   doc:     document whose content is converted with as.String().
##   pattern: regular expression handed to str_locate().
##   context: number of characters kept before and after the match
##            (defaults to 50, the value that used to be hard-coded).
## Returns the substring from `context` characters before the match start to
## `context` characters after the match end.
detect_pattern_on_document_with_context <- function(doc, pattern, context = 50) {
  text <- as.String(doc)
  # str_locate() reports the start and end position of the first match only.
  span <- str_locate(text, pattern)
  substr(text, span[1] - context, span[2] + context)
}

## Returns a data frame with all the patterns detected in a corpus.
##   corpus:   documents to search; their metadata id fills the "File" column.
##   patterns: character vector of regular expressions, one column each.
## The result has one row per document and one column per pattern, named after
## the pattern itself.
detect_patterns_in_corpus <- function(corpus, patterns) {
  entities <- data.frame(matrix(NA, nrow = length(corpus),
                                ncol = length(patterns) + 1))
  names(entities) <- c("File", patterns)

  # Column k + 1 holds the matches of pattern k for every document.
  for (k in seq_along(patterns)) {
    per_document <- lapply(corpus, detect_pattern_on_document,
                           pattern = patterns[k])
    entities[, k + 1] <- unlist(per_document)
  }

  # The first column identifies each document by its metadata id.
  for (d in seq_along(corpus)) {
    entities$File[d] <- meta(corpus[[d]])$id
  }
  entities
}


## Returns a data frame with all the patterns detected in an annotated corpus.
##   corpus:       original documents; used for the row count and for the
##                 metadata id stored in the "File" column.
##   taggedCorpus: annotated counterpart of `corpus` that the patterns are
##                 matched against.
##   patterns:     character vector of regular expressions, one column each.
detect_patterns_in_tagged_corpus <- function(corpus, taggedCorpus, patterns) {
  entities <- data.frame(matrix(NA, nrow = length(corpus),
                                ncol = length(patterns) + 1))
  names(entities) <- c("File", patterns)

  # Column k + 1 holds the matches of pattern k for every tagged document.
  for (k in seq_along(patterns)) {
    per_document <- lapply(taggedCorpus, detect_pattern_on_document,
                           pattern = patterns[k])
    entities[, k + 1] <- unlist(per_document)
  }

  # File ids come from the untagged corpus.
  for (d in seq_along(corpus)) {
    entities$File[d] <- meta(corpus[[d]])$id
  }
  entities
}

## Counts the number of non-NA values in each pattern column.
##   df: data frame whose first column identifies the file and whose remaining
##       columns hold the matches of one pattern each.
## Returns a data frame with the columns "Entity" (pattern column name) and
## "Count" (number of non-NA cells in that column). A data frame without
## pattern columns yields zero rows instead of failing.
count_matches_per_column <- function(df) {
  # Dropping the first column also covers the case of no pattern columns,
  # where an index sequence such as 2:1 would count backwards.
  pattern_columns <- df[-1]
  data.frame(
    Entity = names(pattern_columns),
    Count = vapply(pattern_columns, function(column) sum(!is.na(column)),
                   integer(1), USE.NAMES = FALSE),
    stringsAsFactors = FALSE
  )
}

## Counts the number of non-NA pattern cells for each file.
##   df: data frame with a "File" column first, followed by one column per
##       pattern.
## Returns a data frame with the columns "File" and "Count", restricted to the
## files that have at least one match.
count_matches_per_row <- function(df) {
  counts <- data.frame(
    File = df$File,
    # rowSums() returns doubles; the count is kept as an integer.
    Count = as.integer(rowSums(!is.na(df[-1]))),
    stringsAsFactors = FALSE
  )
  # Filter on the data frame built above; the previous code referenced an
  # undefined variable here.
  counts[counts$Count != 0, ]
}

## Prints, for every pattern, its non-NA matches.
##   patterns: character vector of patterns; pattern k is looked up in column
##             k + 1 of `matches`.
##   matches:  data frame with the file column first and one column per pattern.
## Each pattern produces three printed items: a "PATTERN:" header, the matches
## and a blank separator.
print_matches_per_pattern <- function(patterns, matches) {
  for (idx in seq_along(patterns)) {
    column <- matches[, idx + 1]
    found <- column[!is.na(unlist(column))]

    print(paste("PATTERN: ", patterns[idx]))
    print(found)
    print(" ")
  }
}

## Returns a data frame with all the files and their matches in a single list
## per file.
##   df: data frame with the file identifier in the first column and one
##       column per pattern; a cell may hold several matches separated by ','.
## Returns a data frame with the columns "Files" and "Matches", where
## "Matches" is a list column holding the unique, whitespace-trimmed matches
## of each file.
merge_all_matches_in_lists <- function(df) {
  # Every column after the first; empty when df has only the file column
  # (2:ncol(df) would count backwards there).
  match_columns <- seq_len(ncol(df))[-1]

  matches_per_file <- lapply(seq_len(nrow(df)), function(row) {
    found <- list()
    for (col in match_columns) {
      cell <- df[row, col]
      if (is.na(cell)) {
        next
      }
      # A cell with commas holds several matches; otherwise it is one match.
      pieces <- if (grepl(',', cell)) {
        strsplit(as.character(cell), split = ',')[[1]]
      } else {
        cell
      }
      found <- c(found, as.list(str_trim(pieces)))
    }
    unique(found)
  })

  all_matches <- data.frame(matrix(NA, ncol = 2, nrow = nrow(df)))
  names(all_matches) <- c("Files", "Matches")
  all_matches$Files <- df[, 1]
  all_matches$Matches <- matches_per_file
  all_matches
}

## Returns a data frame with all the files and the gold standard matches in a
## single list per file.
##   df: gold standard table with the file identifier in the first column and
##       the expected matches in the remaining columns (NA where unused).
## Returns a data frame with the columns "Files" and "Matches", where
## "Matches" is a list column with the non-NA entries of each row.
merge_gold_standard_in_lists <- function(df) {
  gold_columns <- 2:length(df)

  # One list per file holding the non-NA gold standard entries of its row.
  matches_per_file <- lapply(seq_len(nrow(df)), function(row) {
    as.list(unlist(Filter(Negate(is.na), df[row, gold_columns])))
  })

  all_matches <- setNames(data.frame(matrix(NA, ncol = 2, nrow = nrow(df))),
                          c("Files", "Matches"))
  all_matches$Files <- df[, 1]
  all_matches$Matches <- matches_per_file
  all_matches
}

## Calculates precision, recall and f-measure according to a gold standard.
##   matches:    data frame whose second column is a list column with the
##               matches found for each file.
##   matches.gs: data frame with a "Matches" list column (second column) that
##               holds the gold standard of the file in the same row.
##   beta:       weight of recall relative to precision in the F-measure
##               (defaults to 1, the value that used to be hard-coded).
## Returns a one-row data frame with the columns "Precision", "Recall" and
## "Fmeasure". Files without gold standard entries are ignored. A metric whose
## denominator is zero is reported as 0 instead of NaN.
calculate_metrics <- function(matches, matches.gs, beta = 1) {
  num_correct <- 0
  all_answers <- 0
  possible_answers <- 0

  for (i in seq_len(nrow(matches))) {
    # Files without gold standard entries are not evaluated.
    if (length(matches.gs$Matches[[i]]) == 0) {
      next
    }
    found <- str_trim(unlist(matches[i, 2]))
    expected <- unname(unlist(matches.gs[i, 2]))
    num_correct <- num_correct + length(intersect(found, expected))
    all_answers <- all_answers + length(found)
    possible_answers <- possible_answers + length(expected)
  }

  # Guard the divisions: 0 / 0 is NaN, which made the later `if` fail.
  precision <- if (all_answers > 0) num_correct / all_answers else 0
  recall <- if (possible_answers > 0) num_correct / possible_answers else 0

  # F-beta weights with beta squared (the previous code used sqrt(beta),
  # which only agrees for beta = 1).
  beta_sq <- beta^2
  f_measure <- if (num_correct == 0) {
    # Precision and recall are both 0 here, so the formula would be 0 / 0.
    0
  } else {
    ((beta_sq + 1) * precision * recall) / ((beta_sq * precision) + recall)
  }

  data.frame(Precision = precision, Recall = recall, Fmeasure = f_measure)
}

Load corpus —-

## Test with a reduced corpus due to Java memory issues
# Read every file in ../data/pos-reduced as UTF-8 and build the corpus.
corpus_source <- DirSource("../data/pos-reduced", encoding = "UTF-8")
corpus <- Corpus(corpus_source)

Annotation —-

# Annotate every document of the corpus; the result has one entry per document.
corpus_annotations <- lapply(corpus, get_annotations_from_document)

Show annotations sentences and words

# First annotations of the first document.
head(corpus_annotations[[1]])
##  id type     start end  features
##   1 sentence     1  265 constituents=<<integer,54>>, parse=<<character,1>>
##   2 sentence   268  439 constituents=<<integer,36>>, parse=<<character,1>>
##   3 sentence   442  591 constituents=<<integer,27>>, parse=<<character,1>>
##   4 sentence   594  797 constituents=<<integer,44>>, parse=<<character,1>>
##   5 sentence   800  939 constituents=<<integer,28>>, parse=<<character,1>>
##   6 sentence   942 1299 constituents=<<integer,70>>, parse=<<character,1>>
# Last annotations of the first document.
tail(corpus_annotations[[1]])
##  id  type start end  features
##  844 word  4189 4197 POS=NN
##  845 word  4199 4199 POS=,
##  846 word  4201 4208 POS=NN
##  847 word  4210 4212 POS=CC
##  848 word  4214 4217 POS=NN
##  849 word  4219 4225 POS=NN

Attach the annotations to the document and store the annotated corpus in another variable

# Pair each document with its annotations (documents and annotations are
# matched by position).
corpus_tagged <- Map(get_annotated_plain_text_document, corpus, corpus_annotations)
# Inspect the first annotated document.
corpus_tagged[[1]]
## <<AnnotatedPlainTextDocument>>
## Metadata:  0
## Annotations:  length: 849
## Content:  chars: 4226

Store all the annotations inline with the text and store the annotated corpus in another variable

# Build the inline "token/POS" text of each document; the patterns below are
# matched against this representation.
corpus_tagged_text <- Map(get_annotated_merged_document, corpus, corpus_annotations)
# Inspect the tagged text of the first document.
corpus_tagged_text[[1]]
## [1] "films/NNS adapted/VBD from/IN comic/JJ books/NNS have/VBP had/VBN plenty/NN of/IN success/NN ,/, whether/IN they/PRP 're/VBP about/IN superheroes/NNS (/-LRB- batman/NN ,/, superman/NN ,/, spawn/NN )/-RRB- ,/, or/CC geared/VBN toward/IN kids/NNS (/-LRB- casper/NN )/-RRB- or/CC the/DT arthouse/NN crowd/NN (/-LRB- ghost/NN world/NN )/-RRB- ,/, but/CC there/EX 's/VBZ never/RB really/RB been/VBN a/DT comic/JJ book/NN like/IN from/IN hell/NN before/IN ./. for/IN starters/NNS ,/, it/PRP was/VBD created/VBN by/IN alan/NN moore/NN (/-LRB- and/CC eddie/JJ campbell/NN )/-RRB- ,/, who/WP brought/VBD the/DT medium/NN to/TO a/DT whole/JJ new/JJ level/NN in/IN the/DT mid/JJ '80s/NNS with/IN a/DT 12-part/JJ series/NN called/VBN the/DT watchmen/NNS ./. to/TO say/VB moore/NN and/CC campbell/NN thoroughly/RB researched/VBD the/DT subject/NN of/IN jack/NN the/DT ripper/NN would/MD be/VB like/IN saying/VBG michael/NN jackson/NN is/VBZ starting/VBG to/TO look/VB a/DT little/JJ odd/JJ ./. the/DT book/NN (/-LRB- or/CC \"/`` graphic/JJ novel/NN ,/, \"/`` if/IN you/PRP will/MD )/-RRB- is/VBZ over/IN 500/CD pages/NNS long/RB and/CC includes/VBZ nearly/RB 30/CD more/RBR that/IN consist/VB of/IN nothing/NN but/CC footnotes/NNS ./. in/IN other/JJ words/NNS ,/, do/VBP n't/RB dismiss/VB this/DT film/NN because/IN of/IN its/PRP$ source/NN ./. if/IN you/PRP can/MD get/VB past/IN the/DT whole/JJ comic/JJ book/NN thing/NN ,/, you/PRP might/MD find/VB another/DT stumbling/JJ block/NN in/IN from/IN hell/NN 's/POS directors/NNS ,/, albert/NN and/CC allen/JJ hughes/NNS ./. 
getting/VBG the/DT hughes/NNS brothers/NNS to/TO direct/VB this/DT seems/VBZ almost/RB as/RB ludicrous/JJ as/IN casting/VBG carrot/NN top/NN in/IN ,/, well/RB ,/, anything/NN ,/, but/CC riddle/VB me/PRP this/DT :/: who/WP better/RB to/TO direct/VB a/DT film/NN that/WDT 's/VBZ set/VBN in/IN the/DT ghetto/NN and/CC features/NNS really/RB violent/JJ street/NN crime/NN than/IN the/DT mad/JJ geniuses/NNS behind/IN menace/NN ii/NNS society/NN ?/. the/DT ghetto/NN in/IN question/NN is/VBZ ,/, of/IN course/NN ,/, whitechapel/NN in/IN 1888/CD london/. 's/POS east/JJ end/NN ./. it/PRP 's/VBZ a/DT filthy/JJ ,/, sooty/JJ place/NN where/WRB the/DT whores/NNS (/-LRB- called/VBN \"/`` unfortunates/JJ \"/'' )/-RRB- are/VBP starting/VBG to/TO get/VB a/DT little/JJ nervous/JJ about/IN this/DT mysterious/JJ psychopath/NN who/WP has/VBZ been/VBN carving/VBG through/IN their/PRP$ profession/NN with/IN surgical/JJ precision/NN ./. when/WRB the/DT first/JJ stiff/NN turns/VBZ up/RP ,/, copper/NN peter/NN godley/NN (/-LRB- robbie/NN coltrane/NN ,/, the/DT world/NN is/VBZ not/RB enough/JJ )/-RRB- calls/VBZ in/IN inspector/NN frederick/NN abberline/NN (/-LRB- johnny/JJ depp/NN ,/, blow/NN )/-RRB- to/TO crack/VB the/DT case/NN ./. abberline/NN ,/, a/DT widower/NN ,/, has/VBZ prophetic/JJ dreams/NNS he/PRP unsuccessfully/RB tries/VBZ to/TO quell/VB with/IN copious/JJ amounts/NNS of/IN absinthe/NNS and/CC opium/NN ./. upon/IN arriving/VBG in/IN whitechapel/NN ,/, he/PRP befriends/VBZ an/DT unfortunate/NN named/VBN mary/JJ kelly/NN (/-LRB- heather/NN graham/NN ,/, say/VBP it/PRP is/VBZ n't/RB so/RB )/-RRB- and/CC proceeds/NNS to/TO investigate/VB the/DT horribly/RB gruesome/JJ crimes/NNS that/IN even/RB the/DT police/NN surgeon/NN ca/MD n't/RB stomach/VB ./. 
i/PRP do/VBP n't/RB think/VB anyone/NN needs/NNS to/TO be/VB briefed/VBN on/IN jack/NN the/DT ripper/NN ,/, so/IN i/PRP wo/MD n't/RB go/VB into/IN the/DT particulars/NNS here/RB ,/, other/JJ than/IN to/TO say/VB moore/NN and/CC campbell/NN have/VBP a/DT unique/JJ and/CC interesting/JJ theory/NN about/IN both/DT the/DT identity/NN of/IN the/DT killer/NN and/CC the/DT reasons/NNS he/PRP chooses/VBZ to/TO slay/VB ./. in/IN the/DT comic/JJ ,/, they/PRP do/VBP n't/RB bother/VB cloaking/VBG the/DT identity/NN of/IN the/DT ripper/NN ,/, but/CC screenwriters/NNS terry/NN hayes/NNS (/-LRB- vertical/JJ limit/NN )/-RRB- and/CC rafael/JJ yglesias/NNS (/-LRB- les/NNS mis/NN ?/. rables/NNS )/-RRB- do/VBP a/DT good/JJ job/NN of/IN keeping/VBG him/PRP hidden/VBN from/IN viewers/NNS until/IN the/DT very/JJ end/NN ./. it/PRP 's/VBZ funny/JJ to/TO watch/VB the/DT locals/NNS blindly/RB point/VBP the/DT finger/NN of/IN blame/NN at/IN jews/NNS and/CC indians/NNS because/IN ,/, after/IN all/DT ,/, an/DT englishman/NN could/MD never/RB be/VB capable/JJ of/IN committing/VBG such/JJ ghastly/JJ acts/NNS ./. and/CC from/IN hell/NN 's/POS ending/NN had/VBD me/PRP whistling/VBG the/DT stonecutters/NNS song/NN from/IN the/DT simpsons/NNS for/IN days/NNS (/-LRB- \"/'' who/WP holds/VBZ back/RB the/DT electric/JJ car/who/NN made/VBD steve/JJ guttenberg/NN a/DT star/NN ?/. \"/`` )/-RRB- ./. do/VBP n't/RB worry/VB -/: it/PRP 'll/MD all/DT make/VB sense/NN when/WRB you/PRP see/VBP it/PRP ./. now/RB onto/IN from/IN hell/NN 's/POS appearance/NN :/: it/PRP 's/VBZ certainly/RB dark/JJ and/CC bleak/JJ enough/JJ ,/, and/CC it/PRP 's/VBZ surprising/JJ to/TO see/VB how/WRB much/RB more/JJR it/PRP looks/VBZ like/IN a/DT tim/JJ burton/NN film/NN than/IN planet/NN of/IN the/DT apes/NNS did/VBD (/-LRB- at/IN times/NNS ,/, it/PRP seems/VBZ like/IN sleepy/JJ hollow/JJ 2/CD )/-RRB- ./. 
the/DT print/NN i/NN saw/VBD was/VBD n't/RB completely/RB finished/VBN (/-LRB- both/DT color/NN and/CC music/NN had/VBD not/RB been/VBN finalized/VBN ,/, so/IN no/DT comments/NNS about/IN marilyn/JJ manson/NN )/-RRB- ,/, but/CC cinematographer/NN peter/NN deming/NN (/-LRB- do/VBP n't/RB say/VB a/DT word/NN )/-RRB- ably/RB captures/VBZ the/DT dreariness/NN of/IN victorian-era/NN london/RB and/CC helped/VBD make/VB the/DT flashy/JJ killing/NN scenes/NNS remind/VBD me/PRP of/IN the/DT crazy/JJ flashbacks/NNS in/IN twin/JJ peaks/NNS ,/, even/RB though/IN the/DT violence/NN in/IN the/DT film/NN pales/NNS in/IN comparison/NN to/TO that/DT in/IN the/DT black-and-white/JJ comic/JJ ./. oscar/NN winner/NN martin/VBG childs/NNS '/POS (/-LRB- shakespeare/NN in/IN love/NN )/-RRB- production/NN design/NN turns/VBZ the/DT original/JJ prague/NN surroundings/NNS into/IN one/CD creepy/JJ place/NN ./. even/RB the/DT acting/VBG in/IN from/IN hell/NN is/VBZ solid/JJ ,/, with/IN the/DT dreamy/JJ depp/NN turning/VBG in/IN a/DT typically/RB strong/JJ performance/NN and/CC deftly/RB handling/VBG a/DT british/JJ accent/NN ./. ians/NNS holm/VBP (/-LRB- joe/NN gould/NN 's/POS secret/NN )/-RRB- and/CC richardson/NN (/-LRB- 102/CD dalmatians/NNS )/-RRB- log/VBP in/IN great/JJ supporting/VBG roles/NNS ,/, but/CC the/DT big/JJ surprise/NN here/RB is/VBZ graham/NN ./. i/NN cringed/VBD the/DT first/JJ time/NN she/PRP opened/VBD her/PRP$ mouth/NN ,/, imagining/VBG her/PRP$ attempt/NN at/IN an/DT irish/JJ accent/NN ,/, but/CC it/PRP actually/RB was/VBD n't/RB half/DT bad/JJ ./. the/DT film/NN ,/, however/RB ,/, is/VBZ all/DT good/JJ ./. 2/CD :/: 00/CD -/: r/NN for/IN strong/JJ violence/gore/NN ,/, sexuality/NN ,/, language/NN and/CC drug/NN content/NN"

Patterns —-

Given patterns

# Each pattern is matched against the "token/POS" text; the parenthesised
# groups are the tokens reported as the detected entity.
# NOTE: `[A-z]` spans every character from 'A' to 'z', which also includes
# `[`, `\`, `]`, `^`, `_` and the backtick. That is why tokens such as
# "_election" show up among the matches; `[A-Za-z]` would restrict the class
# to letters.
patterns <- "created/VBN by/IN ([A-z]*)/NN ([A-z]*)/NN" # created by (name surname)
patterns <- c(patterns, "created/VBN by/IN [A-z]*/NN [A-z]*/NN \\(/-LRB- and/CC ([A-z]*)/JJ ([A-z]*)/NN") # created by name surname ( and (name surname)
patterns <- c(patterns, "screenwriter[s]?/NN[S]? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)") # screenwriter(s) (name surname)
patterns <- c(patterns, "cinematographer/NN(?: ,/,)? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/NN[S]?") # cinematographer [,] (name surname)
patterns <- c(patterns, "cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/IN ([A-z]*)/NN") # cinematographer [,] (name particle surname), middle token tagged IN
patterns <- c(patterns, "oscar/NN winner/NN ([A-z]*)/VBG ([A-z]*)/NNS") # oscar winner (name surname), with the name tagged VBG and the surname NNS

Extension patterns

patterns <- c(patterns, "screenwriters/NNS [A-z]*/(?:NN[S]?|JJ) [A-z]*/(?:NN[S]?|JJ) \\(/-LRB- [A-z /]* \\)/-RRB- and/CC ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN|JJ)") # screenwriters name surname ( ... ) and (name surname): catches the second screenwriter
patterns <- c(patterns, "([A-z]*)/NN ([A-z]*)/NN is/VBZ [A-z]*/VBG") # (name surname) is verb-ing
patterns <- c(patterns, "([A-z]*)/NN ([A-z]*)/NN and/CC [A-z]*/NN[S]?") # (name surname) and name
patterns <- c(patterns, "([A-z]*)/NN and/CC [A-z]*/JJ [A-z]*/NN[S]?") # (name) and name surname

Patterns after analyzing the files randomly

patterns <- c(patterns, "a/DT ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ) film/NN") # a (name surname) film
patterns <- c(patterns, "to/TO say/VB ([A-z]*)/NN and/CC [A-z]*/NN") # to say (name) and name
patterns <- c(patterns, "to/TO say/VB [A-z]*/NN and/CC ([A-z]*)/NN") # to say name and (name)
patterns <- c(patterns, "named/VBN ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)") # named (name surname)
patterns <- c(patterns, "about/IN ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)")  # about (name surname)
patterns <- c(patterns, "made/VBD ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ) a/DT star/NN")  # made (name surname) a star

Most significant patterns to increase recall

patterns <- c(patterns, "([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ|VBP) \\(/-LRB- [A-z]*/(?:NN[S]?|JJ) [A-z]*/(?:NN[S]?|JJ)") # (name surname) ( name surname: captures the pair before the parenthesis
patterns <- c(patterns, "\\(/-LRB- ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)")  # ( (name surname): captures the first two tokens after an opening parenthesis

Patterns that did not work - Generated a lot of FP

# patterns <- c(patterns, "[A-z]*/NN and/CC ([A-z]*)/NN") name and name
# patterns <- c(patterns, "([A-z]*)/NN ([A-z]*)/NN is|was/VBZ|VBD [A-z]*/VBG") # name surname is|was verb-ing

Patterns detection —-

# Match every pattern against the tagged text of each document; file ids are
# taken from the original corpus.
found_entities <- detect_patterns_in_tagged_corpus(corpus, corpus_tagged_text, patterns)

Evaluation —-

Write results file in a csv

# Semicolon-separated export without row names; NA cells are written as empty
# strings. `FALSE` is spelled out because `F` is an ordinary, reassignable
# variable.
write.table(found_entities, file = "results/found_entities.csv", row.names = FALSE, na = "", sep = ";")

Compare with a gold standard

# Collapse the per-pattern columns into one list of unique matches per file.
matches_list <- merge_all_matches_in_lists(found_entities)
# Preview the per-pattern matches of the first files.
head(found_entities)
##              File created/VBN by/IN ([A-z]*)/NN ([A-z]*)/NN
## 1 cv000_29590.txt                                alan moore
## 2 cv001_18431.txt                                      <NA>
## 3 cv002_15918.txt                                      <NA>
## 4 cv003_11664.txt                                      <NA>
## 5 cv004_11636.txt                                      <NA>
## 6 cv005_29443.txt                                      <NA>
##   created/VBN by/IN [A-z]*/NN [A-z]*/NN \\(/-LRB- and/CC ([A-z]*)/JJ ([A-z]*)/NN
## 1                                                                 eddie campbell
## 2                                                                           <NA>
## 3                                                                           <NA>
## 4                                                                           <NA>
## 5                                                                           <NA>
## 6                                                                           <NA>
##   screenwriter[s]?/NN[S]? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)
## 1                                                           terry hayes
## 2                                                                  <NA>
## 3                                                                  <NA>
## 4                                                                  <NA>
## 5                                                                  <NA>
## 6                                                                  <NA>
##   cinematographer/NN(?: ,/,)? ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/NN[S]?
## 1                                                       peter deming
## 2                                                               <NA>
## 3                                                               <NA>
## 4                                                               <NA>
## 5                                                               <NA>
## 6                                                               <NA>
##   cinematographer/NN(?: ,/,)? ([A-z]*)/NN ([A-z]*)/IN ([A-z]*)/NN
## 1                                                              NA
## 2                                                              NA
## 3                                                              NA
## 4                                                              NA
## 5                                                              NA
## 6                                                              NA
##   oscar/NN winner/NN ([A-z]*)/VBG ([A-z]*)/NNS
## 1                                martin childs
## 2                                         <NA>
## 3                                         <NA>
## 4                                         <NA>
## 5                                         <NA>
## 6                                         <NA>
##   screenwriters/NNS [A-z]*/(?:NN[S]?|JJ) [A-z]*/(?:NN[S]?|JJ) \\(/-LRB- [A-z /]* \\)/-RRB- and/CC ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN|JJ)
## 1                                                                                                                           rafael yglesias
## 2                                                                                                                                      <NA>
## 3                                                                                                                                      <NA>
## 4                                                                                                                                      <NA>
## 5                                                                                                                                      <NA>
## 6                                                                                                                                      <NA>
##   ([A-z]*)/NN ([A-z]*)/NN is/VBZ [A-z]*/VBG
## 1                           michael jackson
## 2                                      <NA>
## 3                                      <NA>
## 4                                      <NA>
## 5                                      <NA>
## 6                                      <NA>
##   ([A-z]*)/NN ([A-z]*)/NN and/CC [A-z]*/NN[S]?
## 1                                         <NA>
## 2                            matthew broderick
## 3                                   book chain
## 4                                         <NA>
## 5                                    bruce lee
## 6                                         <NA>
##   ([A-z]*)/NN and/CC [A-z]*/JJ [A-z]*/NN[S]?
## 1                                     albert
## 2                                       <NA>
## 3                                       <NA>
## 4                                       <NA>
## 5                                       <NA>
## 6                                       <NA>
##   a/DT ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ) film/NN
## 1                                                 tim burton
## 2                                                       <NA>
## 3                                                       <NA>
## 4                                                       <NA>
## 5                                                       <NA>
## 6                                                       <NA>
##   to/TO say/VB ([A-z]*)/NN and/CC [A-z]*/NN
## 1                              moore, moore
## 2                                      <NA>
## 3                                      <NA>
## 4                                      <NA>
## 5                                      <NA>
## 6                                      <NA>
##   to/TO say/VB [A-z]*/NN and/CC ([A-z]*)/NN
## 1                        campbell, campbell
## 2                                      <NA>
## 3                                      <NA>
## 4                                      <NA>
## 5                                      <NA>
## 6                                      <NA>
##   named/VBN ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)
## 1                                              mary kelly
## 2                                                    <NA>
## 3                                                    <NA>
## 4                                                    <NA>
## 5                                                    <NA>
## 6                                                    <NA>
##   about/IN ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)
## 1                                         marilyn manson
## 2                                                   <NA>
## 3                                                   <NA>
## 4                                                   <NA>
## 5                                                   <NA>
## 6                                                   <NA>
##   made/VBD ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ) a/DT star/NN
## 1                                                    steve guttenberg
## 2                                                                <NA>
## 3                                                                <NA>
## 4                                                                <NA>
## 5                                                                <NA>
## 6                                                                <NA>
##    ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ|VBP) \\(/-LRB- [A-z]*/(?:NN[S]?|JJ) [A-z]*/(?:NN[S]?|JJ)
## 1 arthouse crowd, peter godley, frederick abberline, mary kelly, terry hayes, rafael yglesias, ians holm
## 2                 tracy flick, tracy flick, max fischer, _election _, _rushmore _, individual screenplay
## 3                                                                                                   <NA>
## 4                                                                martin brody, larry vaughn, matt hooper
## 5                                                                                                   <NA>
## 6                                                                                         moise tschombe
##                                                                    \\(/-LRB- ([A-z]*)/(?:NN[S]?|JJ) ([A-z]*)/(?:NN[S]?|JJ)
## 1                            ghost world, robbie coltrane, johnny depp, heather graham, vertical limit, les mis, joe gould
## 2 reese witherspoon, matthew broderick, _election _, _rushmore _, matthew broderick, bill murray, _rushmore _, _election _
## 3                                                                                                                     <NA>
## 4                                                             roy scheider, murray hamilton, richard dreyfuss, robert shaw
## 5                                                                                                                     <NA>
## 6                                                                                 eriq ebouaney, maka kotto, pascal nzonzi

Load the gold standard and put all gold standard matches in a list for comparison.

# Read the gold standard: ';'-separated, quoting disabled, every column read
# as character and empty fields treated as NA.
gold_standard <- read.table(file = "../data/goldStandard.csv", quote = "", na.strings = "",
                            colClasses = "character", sep = ";")
# One list of expected matches per file, for comparison with matches_list.
gold_standard_matches_list <- merge_gold_standard_in_lists(gold_standard)
head(gold_standard_matches_list)
##             Files
## 1 cv000_29590.txt
## 2 cv001_18431.txt
## 3 cv002_15918.txt
## 4 cv003_11664.txt
## 5 cv004_11636.txt
## 6 cv005_29443.txt
##                                                                                                                                                                                                                                                                                                                                                     Matches
## 1 alan moore, eddie campbell, moore, campbell, jack, michael jackson, albert, allen hughes, peter godley, robbie coltrane, frederick abberline, johnny depp, abberline, mary kelly, heather graham, terry hayes, rafael yglesias, steve guttenberg, tim burton, marilyn manson, peter deming, martin childs, depp, ians holm, joe gould, richardson, graham
## 2                                                                                                                                                 matthew broderick, reese witherspoon, george washington carver, tracy flick, paul, max fischer, bill murray, broderick, witherspoon, jessica campbell, tammy, rooney, campbell, alexander payne, tracy, m
## 3                                                                                                                                                                                                                                                                                   ryan, hanks, tom hanks, joe fox, meg ryan, kathleen kelley, fox, kelley
## 4                                                                          john williams, steven spielberg, spielberg, williams, martin brody, roy scheider, larry vaughn, murray hamilton, brody, matt hooper, richard dreyfuss, hooper, vaughn, quint, robert shaw, hitchcock, scheider, dreyfuss, shaw, robert redford, paul newman, duddy kravitz, ahab
## 5                                                                                                                                                                                                                    herb, jackie chan, barry sanders, sanders, jackie, chan, bruce lee, tim allen, lawrence kazdan, john williams, spielberg, george lucas
## 6                                                                                                                                                                                             raoul peck, lumumba, patrice lumumba, eriq ebouaney, helmer peck, peck, pascal bonitzer, patrice, joseph kasa vubu, maka kotto, moise tschombe, pascal nzonzi

Show lists for the second file

## MY MATCHES (entry 2, i.e. the second file)
print(unlist(matches_list$Matches[[2]]))
## [1] "matthew broderick"     "tracy flick"           "max fischer"          
## [4] "_election _"           "_rushmore _"           "individual screenplay"
## [7] "reese witherspoon"     "bill murray"
## GOLD STANDARD MATCHES (entry 2, i.e. the second file)
print(unlist(gold_standard_matches_list$Matches[[2]]))
##                         V2                         V3 
##        "matthew broderick"        "reese witherspoon" 
##                         V4                         V5 
## "george washington carver"              "tracy flick" 
##                         V6                         V7 
##                     "paul"              "max fischer" 
##                         V8                         V9 
##              "bill murray"                "broderick" 
##                        V10                        V11 
##              "witherspoon"         "jessica campbell" 
##                        V12                        V13 
##                    "tammy"                   "rooney" 
##                        V14                        V15 
##                 "campbell"          "alexander payne" 
##                        V16                        V17 
##                    "tracy"                        "m"

Final results

# Score the extracted matches against the gold standard.
metrics <- calculate_metrics(matches_list, gold_standard_matches_list)

## Show metrics
metrics
##   Precision    Recall  Fmeasure
## 1  0.826087 0.3877551 0.5277778

Conclusion

In this exercise I tried to obtain a higher recall by sacrificing a bit of precision, obtaining a recall of around 0.4 and a precision of around 0.8 on the files I tested the patterns on. The good news is that the resulting F-measure (around 0.53) is fairly high, which indicates good performance.

In order to do so, I examined randomly chosen files to study where names usually appear. I found that one of the most common places was inside parentheses, where a name is given to explain a reference. Hence, that is where I found the most significant patterns for increasing recall.

I would have liked to have a more powerful PC to run the metrics over the whole corpus, but I could only use a reduced dataset, as my computer ran into Java memory issues otherwise.