1 import json
2 import sys
3 import argparse
4
6 """
7 Stores details about a sentence.
8
9 @ivar tokens: A tokenized version of the sentence with punctuation removed and
10 words made lower case.
11 @ivar word_sentiment: Indicates which words are part of an Adjective Noun
12 Pair with sentiment; 1 iff the word is part of an ANP with sentiment.
13 @ivar sentiment_polarity: Whether this sentence expresses positive or negative sentiment.
14 @ivar raw_sentence: The caption without any processing; taken directly from MTURK.
15 """
16
17 NEGATIVE_SENTIMENT = 0
18 POSITIVE_SENTIMENT = 1
19
21 self.tokens = []
22 self.word_sentiment = []
23 self.sentiment_polarity = []
24 self.raw_sentence = []
25
27 assert isinstance(tokens, list)
28 for tok in tokens:
29 assert isinstance(tok, str) or isinstance(tok, unicode)
30
31 self.tokens = tokens
32
34 assert isinstance(word_sentiment, list)
35
36 self.word_sentiment = [int(s) for s in word_sentiment]
37
42
44 assert isinstance(raw_sentence, str) or isinstance(raw_sentence, unicode)
45
46 self.raw_sentence = raw_sentence
47
50
52 return self.word_sentiment
53
55 return self.sentiment_polarity
56
58 return self.raw_sentence
59
60
62 """
63 Stores details about an image and its sentences.
64
65 @ivar filename: The filename of the image in the MSCOCO dataset
66 @ivar imgid: A unique but arbitrary number assigned to each image.
67 @ivar sentences: A list of sentences corresponding to this image of
68 type `SenticapSentence`.
69 @ivar split: Indicates if this is part of the TEST, TRAIN or VAL split.
70
71 """
72 TEST_SPLIT = 0
73 TRAIN_SPLIT = 1
74 VAL_SPLIT = 2
75
76
78 self.filename = ""
79 self.imgid = None
80 self.sentences = []
81 self.split = None
82
84 assert isinstance(filename, str) or isinstance(filename, unicode)
85 self.filename = filename
86
89
91 assert isinstance(sentence, SenticapSentence)
92 self.sentences.append(sentence)
93
97
100
103
105 return self.sentences
106
109
110
112 """Handles the reading of the senticap dataset.
113 Has functions to write examples to a simple csv format,
114 and to count the number of examples.
115 """
116
117 images = []
118
120 """
121 Initializer that reads a senticap json file
122
123 @param filename: the file path of the json file
124 """
125 self.readJson(filename)
126 self.filename = filename
127
163
def writeCSV(self, output_filename, train=True, test=True, val=True, pos=True, neg=True):
    """
    Write a CSV file from the examples matching the filter criteria. The
    columns of the csv are (filename, is_positive_sentiment, caption).
    where:
        - B{filename:} is the value returned by the image's
            C{getFilename()}
        - B{is_positive_sentiment:} is 1 if the sentence expresses
            positive sentiment, 0 otherwise
        - B{caption:} is the sentence's tokens joined with space
            characters, wrapped in double quotes

    @param output_filename: path of csv to write
    @param train: include training examples
    @param test: include testing examples
    @param val: include validation examples
    @param pos: include positive sentiment examples
    @param neg: include negative sentiment examples
    """
    # The context manager closes the file even when an accessor raises
    # part-way through the dataset.
    with open(output_filename, "w") as fout:
        fout.write("filename,is_positive_sentiment,caption\n")
        for im in self.images:
            split = im.getSplit()
            if split == im.TEST_SPLIT and not test:
                continue
            if split == im.TRAIN_SPLIT and not train:
                continue
            if split == im.VAL_SPLIT and not val:
                continue
            for sent in im.getSentences():
                polarity = sent.getSentimentPolarity()
                if polarity == sent.NEGATIVE_SENTIMENT and not neg:
                    continue
                if polarity == sent.POSITIVE_SENTIMENT and not pos:
                    continue
                # Double any embedded quote so the caption stays one
                # quoted CSV field; captions without quotes are unchanged.
                caption = ' '.join(sent.getTokens()).replace('"', '""')
                fout.write('%s,%d,"%s"\n' % (
                    im.getFilename(),
                    polarity == sent.POSITIVE_SENTIMENT,
                    caption))
202
def countExamples(self, train=True, test=True, val=True, pos=True, neg=True):
    """
    Tally the images and sentences that pass the split and sentiment
    filters.

    An image is counted only when at least one of its sentences passes
    the sentiment filter.

    @param train: include training examples
    @param test: include testing examples
    @param val: include validation examples
    @param pos: include positive sentiment examples
    @param neg: include negative sentiment examples
    @return: a tuple giving the number of images with sentences and the
        total number of sentences
    @rtype: `tuple(int, int)`
    """
    def _filtered_out(sentence):
        # True when the sentence's polarity has been switched off.
        polarity = sentence.getSentimentPolarity()
        return ((polarity == sentence.NEGATIVE_SENTIMENT and not neg) or
                (polarity == sentence.POSITIVE_SENTIMENT and not pos))

    image_total = 0
    sentence_total = 0
    for image in self.images:
        split = image.getSplit()
        split_disabled = ((split == image.TEST_SPLIT and not test) or
                          (split == image.TRAIN_SPLIT and not train) or
                          (split == image.VAL_SPLIT and not val))
        if split_disabled:
            continue

        kept = sum(1 for sentence in image.getSentences()
                   if not _filtered_out(sentence))
        if kept:
            image_total += 1
            sentence_total += kept

    return (image_total, sentence_total)
239
240
242
243
# Command-line driver: parse the filter flags, load the dataset, then either
# write the filtered examples to a CSV or print how many examples match.
ap = argparse.ArgumentParser()
ap.add_argument("--filename", "-f", default="./data/senticap_dataset.json",
                help="Path to the senticap json")
ap.add_argument("--csv_output", "-o", help="Where to write the csv file.")
ap.add_argument("--train", action="store_true", help="Include the training examples")
ap.add_argument("--test", action="store_true", help="Include the testing examples")
ap.add_argument("--val", action="store_true", help="Include the validation examples")
ap.add_argument("--pos", action="store_true",
                help="Include the positive sentiment examples")
ap.add_argument("--neg", action="store_true",
                help="Include the negative sentiment examples")
args = ap.parse_args()

sr = SenticapReader(args.filename)
if args.csv_output:
    # --pos/--neg used to be ignored in CSV mode, so both polarities were
    # always written. Honour the flags when either is given, and keep the
    # old "write both" behaviour when neither is, so existing invocations
    # produce the same file.
    no_sentiment_flag = not (args.pos or args.neg)
    sr.writeCSV(args.csv_output, train=args.train, test=args.test, val=args.val,
                pos=args.pos or no_sentiment_flag,
                neg=args.neg or no_sentiment_flag)
else:
    count = sr.countExamples(train=args.train, test=args.test, val=args.val,
                             pos=args.pos, neg=args.neg)
    active_filters = [label for enabled, label in (
        (args.train, "Train"),
        (args.test, "Test"),
        (args.val, "Val"),
        (args.pos, "Positive"),
        (args.neg, "Negative"),
    ) if enabled]
    # Single-argument print(...) calls behave the same as a statement on
    # Python 2 and as a function on Python 3. The trailing " \n" reproduces
    # the spacing and blank line the former print statements emitted.
    print("Input Filename: %s" % args.filename)
    print(" ".join(["Filters:"] + active_filters) + " \n")
    print("Number of images: %d\nNumber of Sentences: %d" % count)
277
278
279 if __name__ == "__main__":
280 main()
281