Module senticap_reader
[hide private]
[frames | no frames]

Source Code for Module senticap_reader

  1  import json 
  2  import sys 
  3  import argparse 
  4   
class SenticapSentence(object):
    """
    Stores details about a sentence.

    @ivar tokens: A tokenized version of the sentence with punctuation removed and
            words made lower case.
    @ivar word_sentiment: Indicates which words are part of an Adjective Noun
            Pair with sentiment; 1 iff the word is part of an ANP with sentiment.
    @ivar sentiment_polarity: Does this sentence express positive or negative sentiment.
    @ivar raw_sentence: The caption without any processing; taken directly from MTURK.
    """

    NEGATIVE_SENTIMENT = 0
    POSITIVE_SENTIMENT = 1

    # Text types accepted by the setters. `unicode` only exists on Python 2;
    # resolving it once here keeps the type checks failing with an
    # AssertionError (not a NameError) on interpreters that lack it.
    try:
        _STRING_TYPES = (str, unicode)  # noqa: F821
    except NameError:
        _STRING_TYPES = (str,)

    def __init__(self):
        # Every field starts out as an empty list until its setter is called.
        self.tokens = []
        self.word_sentiment = []
        self.sentiment_polarity = []
        self.raw_sentence = []

    def setTokens(self, tokens):
        """
        Store the token list.

        @param tokens: a list of strings
        @raise AssertionError: if `tokens` is not a list of strings
        """
        assert isinstance(tokens, list)
        for tok in tokens:
            assert isinstance(tok, self._STRING_TYPES)

        self.tokens = tokens

    def setWordSentiment(self, word_sentiment):
        """
        Store the per-word sentiment flags; each entry is converted with `int`.

        @param word_sentiment: a list of values convertible to int
        @raise AssertionError: if `word_sentiment` is not a list
        """
        assert isinstance(word_sentiment, list)

        self.word_sentiment = [int(s) for s in word_sentiment]

    def setSentimentPolarity(self, sentiment_polarity):
        """
        Store the polarity of the sentence.

        @param sentiment_polarity: `NEGATIVE_SENTIMENT` or `POSITIVE_SENTIMENT`
        @raise AssertionError: if the value is neither of the two constants
        """
        assert sentiment_polarity in [self.NEGATIVE_SENTIMENT, self.POSITIVE_SENTIMENT]

        self.sentiment_polarity = sentiment_polarity

    def setRawSentence(self, raw_sentence):
        """
        Store the unprocessed caption.

        @param raw_sentence: the caption as a string
        @raise AssertionError: if `raw_sentence` is not a string
        """
        assert isinstance(raw_sentence, self._STRING_TYPES)

        self.raw_sentence = raw_sentence

    def getTokens(self):
        """@return: the stored token list"""
        return self.tokens

    def getWordSentiment(self):
        """@return: the stored per-word sentiment flags"""
        return self.word_sentiment

    def getSentimentPolarity(self):
        """@return: the stored sentiment polarity"""
        return self.sentiment_polarity

    def getRawsentence(self):
        """@return: the stored unprocessed caption"""
        return self.raw_sentence

    # Correctly cased accessor matching `setRawSentence`; the original
    # `getRawsentence` spelling is kept so existing callers keep working.
    getRawSentence = getRawsentence
class SenticapImage(object):
    """
    Stores details about an image.

    @ivar filename: The filename of the image in the MSCOCO dataset
    @ivar imgid: A unique but arbitrary number assigned to each image.
    @ivar sentences: A list of sentences corresponding to this image of
            type `SenticapSentence`.
    @ivar split: Indicates if this is part of the TEST, TRAIN or VAL split.

    """
    TEST_SPLIT = 0
    TRAIN_SPLIT = 1
    VAL_SPLIT = 2

    # Text types accepted by `setFilename`. `unicode` only exists on Python 2;
    # resolving it once here keeps the type check failing with an
    # AssertionError (not a NameError) on interpreters that lack it.
    try:
        _STRING_TYPES = (str, unicode)  # noqa: F821
    except NameError:
        _STRING_TYPES = (str,)

    def __init__(self):
        self.filename = ""
        self.imgid = None
        self.sentences = []
        self.split = None

    def setFilename(self, filename):
        """
        Store the image filename.

        @param filename: the filename as a string
        @raise AssertionError: if `filename` is not a string
        """
        assert isinstance(filename, self._STRING_TYPES)
        self.filename = filename

    def setImgID(self, imgid):
        """
        Store the image id; the value is not validated.

        @param imgid: the id to store
        """
        self.imgid = imgid

    def addSentence(self, sentence):
        """
        Append one sentence to this image.

        @param sentence: a `SenticapSentence` instance
        @raise AssertionError: if `sentence` is not a `SenticapSentence`
        """
        assert isinstance(sentence, SenticapSentence)
        self.sentences.append(sentence)

    def setSplit(self, split):
        """
        Store which dataset split this image belongs to.

        @param split: one of `TEST_SPLIT`, `TRAIN_SPLIT` or `VAL_SPLIT`
        @raise AssertionError: if `split` is not one of the three constants
        """
        assert split in [self.TEST_SPLIT, self.TRAIN_SPLIT, self.VAL_SPLIT]
        self.split = split

    def getFilename(self):
        """@return: the stored filename"""
        return self.filename

    def getImgID(self):
        """@return: the stored image id"""
        return self.imgid

    def getSentences(self):
        """@return: the list of sentences added so far"""
        return self.sentences

    def getSplit(self):
        """@return: the stored split constant, or None if it was never set"""
        return self.split
class SenticapReader(object):
    """Handles the reading of the senticap dataset.
    Has functions to write examples to a simple csv format,
    and to count the number of examples.
    """

    # Empty class-level default so the attribute always exists. It is never
    # mutated: `readJson` rebinds `self.images` on the instance, so readers
    # do not share (and accumulate into) one list.
    images = []

    def __init__(self, filename):
        """
        Initializer that reads a senticap json file

        @param filename: the file path of the json file
        """
        self.images = []
        self.readJson(filename)
        self.filename = filename

    def readJson(self, filename):
        """
        Read a senticap json file and load it into `SenticapImage` and
        `SenticapSentence` classes. The loaded images are added after any
        images already held in `self.images`.

        @param filename: the file path of the json file
        """
        # The context manager closes the handle even if parsing fails.
        with open(filename, "r") as fin:
            data = json.load(fin)

        loaded = []
        for image in data["images"]:

            # create the SenticapImage entry
            im = SenticapImage()
            im.setFilename(image["filename"])
            # Any other split value leaves the image's split unset.
            if image["split"] == "train":
                im.setSplit(im.TRAIN_SPLIT)
            elif image["split"] == "test":
                im.setSplit(im.TEST_SPLIT)
            elif image["split"] == "val":
                im.setSplit(im.VAL_SPLIT)
            im.setImgID(image["imgid"])

            # for this image create all the SenticapSentence entries
            for sent in image["sentences"]:
                se = SenticapSentence()
                se.setTokens(sent["tokens"])
                se.setWordSentiment(sent["word_sentiment"])
                # A value of 0 means negative; anything else is positive.
                if sent["sentiment"] == 0:
                    se.setSentimentPolarity(se.NEGATIVE_SENTIMENT)
                else:
                    se.setSentimentPolarity(se.POSITIVE_SENTIMENT)
                se.setRawSentence(sent["raw"])
                im.addSentence(se)

            loaded.append(im)

        # Rebind rather than append in place so the class-level default list
        # is never modified.
        self.images = list(self.images) + loaded

    def _iterExamples(self, train, test, val, pos, neg):
        """
        Yield `(image, sentences)` for every image passing the split filter,
        where `sentences` holds only the sentences passing the polarity filter
        (it may be empty).
        """
        for im in self.images:
            split = im.getSplit()
            if split == im.TEST_SPLIT and not test:
                continue
            if split == im.TRAIN_SPLIT and not train:
                continue
            if split == im.VAL_SPLIT and not val:
                continue

            kept = []
            for sent in im.getSentences():
                polarity = sent.getSentimentPolarity()
                if polarity == sent.NEGATIVE_SENTIMENT and not neg:
                    continue
                if polarity == sent.POSITIVE_SENTIMENT and not pos:
                    continue
                kept.append(sent)
            yield im, kept

    def writeCSV(self, output_filename, train=True, test=True, val=True, pos=True, neg=True):
        """
        Write a CSV file from the examples matching the filter criteria. The
        columns of the csv are (filename, is_positive_sentiment, caption).
        where:
            - B{filename:} is the filename of the MSCOCO image
            - B{is_positive_sentiment:} is 1 if the sentence expresses
                positive sentiment 0 if the sentence expresses
                negative sentiment
            - B{caption:} is the tokenized, lowercase,
                punctuation removed sentence joined with space
                characters

        @param output_filename: path of csv to write
        @param train: include training examples
        @param test: include testing examples
        @param val: include validation examples
        @param pos: include positive sentiment examples
        @param neg: include negative sentiment examples
        """
        # The context manager closes the file even if a write fails.
        with open(output_filename, "w") as fout:
            fout.write("filename,is_positive_sentiment,caption\n")
            for im, sentences in self._iterExamples(train, test, val, pos, neg):
                for sent in sentences:
                    # The caption is always wrapped in double quotes, so any
                    # embedded double quote is doubled to keep the row valid.
                    caption = ' '.join(sent.getTokens()).replace('"', '""')
                    is_positive = sent.getSentimentPolarity() == sent.POSITIVE_SENTIMENT
                    fout.write('%s,%d,"%s"\n' % (im.getFilename(),
                                                 int(is_positive),
                                                 caption))

    def countExamples(self, train=True, test=True, val=True, pos=True, neg=True):
        """
        Count the number of examples matching the filter criteria

        @param train: include training examples
        @param test: include testing examples
        @param val: include validation examples
        @param pos: include positive sentiment examples
        @param neg: include negative sentiment examples
        @return: a tuple giving the number of images with sentences and the
            total number of sentences
        @rtype: `tuple(int, int)`
        """
        num_sentence = 0
        num_image_with_sentence = 0
        for _im, sentences in self._iterExamples(train, test, val, pos, neg):
            num_sentence += len(sentences)
            # Only images with at least one matching sentence are counted.
            if sentences:
                num_image_with_sentence += 1

        return (num_image_with_sentence, num_sentence)
def main():
    """
    Command line entry point.

    Loads the senticap json given by --filename. If --csv_output is given,
    the examples selected by the --train/--test/--val/--pos/--neg flags are
    written to that csv file; otherwise the number of selected images and
    sentences is printed. Every flag defaults to off, so a split or polarity
    is only included when its flag is passed.
    """

    # handle arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("--filename", "-f", default="./data/senticap_dataset.json",
                    help = "Path to the senticap json")
    ap.add_argument("--csv_output", "-o", help = "Where to write the csv file.")
    ap.add_argument("--train", action="store_true", help = "Include the training examples")
    ap.add_argument("--test", action="store_true", help = "Include the testing examples")
    ap.add_argument("--val", action="store_true", help = "Include the validation examples")
    ap.add_argument("--pos", action="store_true",
                    help = "Include the positive sentiment examples")
    ap.add_argument("--neg", action="store_true",
                    help = "Include the negative sentiment examples")
    args = ap.parse_args()

    sr = SenticapReader(args.filename)
    if args.csv_output:
        # Forward the polarity flags as well, so --pos/--neg filter the csv
        # the same way they filter the counts below.
        sr.writeCSV(args.csv_output, train=args.train, test=args.test, val=args.val,
                    pos=args.pos, neg=args.neg)
    else:
        count = sr.countExamples(train=args.train, test=args.test, val=args.val,
                                 pos=args.pos, neg=args.neg)
        # Single-argument print(...) calls behave the same whether print is a
        # statement or a function.
        print("Input Filename: %s" % args.filename)
        active_filters = [
            label
            for label, enabled in (
                ("Train", args.train),
                ("Test", args.test),
                ("Val", args.val),
                ("Positive", args.pos),
                ("Negative", args.neg),
            )
            if enabled
        ]
        print("Filters: " + " ".join(active_filters))
        print("")
        print("Number of images: %d\nNumber of Sentences: %d" % count)


if __name__ == "__main__":
    main()