require 'rubygems'
require 'sinatra'
require 'mongo_mapper'
require 'twitter'
require 'hpricot'
require 'haml'
require 'open-uri'
require 'time' #Time.parse is used by the crawler below
require 'json' #for the to_json calls in the json endpoints
#gems for api services for the crawler
require 'youtube_it'
#https://github.com/matthooks/vimeo
require 'vimeo'
#https://github.com/meltingice/ruby-twitpic
#require 'twitpic-full'
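#config.rb is assumed to set @db_server, @db_name, @devkey and the
#@twitter_* credentials at the top level of the file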
require './config.rb'
#top-level instance variables from config.rb are not visible inside the
#sinatra route blocks below, so expose the youtube api key as a constant
DEV_KEY = @devkey
#https://github.com/Instagram/instagram-ruby-gem
#require 'instagram'
#https://github.com/ctagg/flickr
#subclasses
#relative requires, so this also loads where '.' is not on the load path
require './tweetstache/expand_url.rb'
require './tweetstache/mosaic.rb'
require './tweetstache/terms.rb'
#mongo
MongoMapper.connection = Mongo::Connection.new(@db_server)
MongoMapper.database = @db_name
#mongodb collection classes
#these are tweets that are intentionally saved for republishing
class Tweet
include MongoMapper::Document
end
#these are all tweets crawled
class CrawledTweet
include MongoMapper::Document
end
class Term
include MongoMapper::Document
timestamps!
end
class BlockedUser
include MongoMapper::Document
end
#really this needs to be migrated to videos
class Youtube
include MongoMapper::Document
end
class Video
include MongoMapper::Document
end
#get each tweet for services like twitpic and via.me
#one lambda per service, to scrape the page or hit its api (sketch below)
#CrawledTweet.first({:conditions=>{["entities.urls.0.expanded_url"]=>{'$exists'=>true},["entities.urls.0.expanded_url"]=>/#{service}/}})
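#a sketch of the "lambda for each service" idea above: map a hostname to a
#handler that scrapes the page (or could call the service api instead);
#SERVICE_HANDLERS and handler_for are illustrative names, not used elsewhere
SERVICE_HANDLERS = {
"twitpic.com" => lambda {|url| Hpricot(open(url).read) },
"via.me" => lambda {|url| Hpricot(open(url).read) }
}
def handler_for(expanded_url)
match = SERVICE_HANDLERS.detect {|host, handler| expanded_url.include?(host) }
match && match[1]
end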
Twitter.configure do |config|
config.consumer_key = @twitter_consumer
config.consumer_secret = @twitter_consumer_secret
config.oauth_token = @twitter_oauth_token
config.oauth_token_secret = @twitter_oauth_secret
end
get '/' do
#index of tweets
@tweets = Tweet.all.reverse
haml :index
end
#display the save tweet form
get '/save' do
@tweets = Tweet.all.reverse
haml :save
end
#fetch one tweet from a twitter url, get the json, save as json
post '/save' do
#parse the id string out of the url
#fetch the tweet and save it to mongo cache
#is it twitter? (is twitter.com in the url?)
#save some meta data
#client ip, originating site, meta tags, cache timestamp, processed timestamp
if (params[:url].split("twitter.com").size >1)
#ugly split
id = params[:url].split("twitter.com")[1].split("/")[4]
a_tweet = Twitter.status(id).attrs
Tweet.collection.update({:id_str=>a_tweet["id_str"].to_s},a_tweet, {:upsert => true})
#now view the tweet
@tweet = a_tweet["text"]
else
#if not, parse what we can with hpricot and just save the whole page
html = ""
open(params[:url]) {|f|
f.each_line {|line| html << line}
}
@html = Hpricot(html)
title = (@html/"title")[0].inner_html
Tweet.collection.update({:url=>params[:url]}, {:html=>html, :url=>params[:url],:title=>title}, {:upsert => true})
@tweet = title
end
haml :save
end
#gateway to return tweet json to ushahidi form
get '/tweet/*' do
response['Access-Control-Allow-Origin'] = '*'
params[:url] = params[:splat].join("/")
#if it's a tweet, grab the json, parse it, echo it and save it
#(unfinished stub; the original tested for "youtube.com" here, which looks
#like a copy/paste from the /video route below)
if params[:url].include?("twitter.com")
#TODO fetch, cache and echo the tweet json
end
end
#crawl a tweet search term
#m1gs, m1nyc, #baym1gs, #888turk
#chicago, robeson school,
#between may 1 and may 2
#anything from mapreport, mapreport2 mapreport3
#philly, portland, chicago, sf, boston
get '/crawl' do
#filter out retweets (via the "-rt" operator in the search query below)
@terms = Term.all({:conditions=>{:is_active=>'yes'},:order=>:last_checked.desc})
puts @terms.inspect
@blocked = BlockedUser.all
@block = {}
sleep 1
@blocked.each do |block|
@block[block["user_id"]] = block["user_id"]
end
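#crawl in day-sized windows, using each of these dates as the :until bound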
@time = ['2012-05-02','2012-05-03','2012-05-04']
@terms.each do |term|
#get an id from may 1, early
#197149879875813377
@time.each do |date_until|
puts term.inspect
#find one term to get max id
max = CrawledTweet.all({:conditions=>{:timestamp=>{:$lte=>Time.parse(date_until).to_i},:text=>Regexp.new(term.term)},:limit=>1, :order=>:timestamp.asc})
puts max.inspect
if max[0]
max_id = {:max_id=>max[0].id_str}
else
max_id = {}
end
puts max_id.inspect
puts date_until
15.times do |p|
#example query: m1gs since_id:196982181401341952 until:2012-05-03
#the extra "-rt -facials -amateur" terms drop retweets and spam
search_opts = {:rpp=>100, :page=>p+1, :since_id=>196982181401341952, :until=>date_until, :include_entities=>1}.merge(max_id)
begin
tweets = Twitter.search(term.term.to_s + " -rt -facials -amateur", search_opts)
rescue Twitter::Error::BadGateway, Twitter::Error::Forbidden, NoMethodError
#back off and retry once; the original rescued each error in its own
#clause, so only NoMethodError actually triggered the retry
puts "search failed, backing off"
sleep 120
tweets = Twitter.search(term.term.to_s + " -rt -facials -amateur", search_opts) rescue nil
end
tweets ||= []
puts tweets.size
tweets.each do | a_tweet |
begin
a_tweet.attrs["timestamp"] = Time.parse(a_tweet.attrs["created_at"]).to_i
rescue NoMethodError
a_tweet.attrs["timestamp"] = 1
end
#extract vids for embed code
if a_tweet.attrs["entities"]
if a_tweet.attrs["entities"]["urls"] !=nil
a_tweet.attrs["entities"]["urls"].each do |url|
begin
url["expanded_url"].expand_urls!
rescue NoMethodError
url["expanded_url"] = ""
end
if url["expanded_url"].split("youtube.com").size >1 || url["expanded_url"].split("youtu.be").size > 1
client = YouTubeIt::Client.new(:dev_key => @devkey)
begin
vid = client.video_by(url["expanded_url"])
a_tweet.attrs["video_embed"] = vid.embed_html
rescue OpenURI::HTTPError => e
end
elsif (url["expanded_url"].split("vimeo.com").size > 1)
video_id = url["expanded_url"].split("/").last
vid = Vimeo::Simple::Video.info(video_id)
a_tweet.attrs["video_embed"] = '<iframe src="http://player.vimeo.com/video/#{vid.id}" width="500" height="313" frameborder="0" webkitAllowFullScreen mozallowfullscreen allowFullScreen></iframe>'
elsif (url["expanded_url"].split("ht.ly").size > 1)
a_tweet.attrs["block"] =1
elsif (url["expanded_url"]).split("instagr.am").size > 1
begin OpenURI::HTTPError
#add the media link
html = ""
open(url["expanded_url"]) {|f|
f.each_line {|line| html << line}
}
@html = Hpricot(html)
a_tweet.attrs["entites.media.0.media_url"] =(@html/"img.photo")[0][:src]
a_tweet.attrs["entities"]["media"] = [:expanded_url=> (@html/"img.photo")[0][:src],:size=>{:small=>{:h=>320}}]
rescue
end
end
#expanded url for twitpic
#http://instagr.am/
#yfrog
#via.me
#lockerz
end
end
end
if @block[a_tweet.attrs["from_user_id"].to_s] !=nil
a_tweet.attrs["block"] = 1
end
begin
#drop the numeric id field (the crawler keys documents on id_str)
a_tweet.attrs['id'] = nil
CrawledTweet.collection.update({:id_str=>a_tweet.attrs["id_str"].to_s}, a_tweet.attrs, {:upsert => true})
rescue => e
puts "could not save tweet: #{e}"
end
end
sleep 2
end
Term.collection.update({:term=>term.term},{:term=>term.term,:last_checked=>Time.now,:is_active=>term.is_active},{:upsert=>true})
sleep 30
end
end
haml :crawl
end
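#illustrative usage: kick off a crawl by hand or from cron, e.g.
#  curl http://localhost:4567/crawl
#the sleeps above keep the loop under twitter's search rate limits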
get "/crawl/tweets/:page/?:media:?" do
if params[:page]==nil
page = 0
else
page = params[:page].to_i
end
if params[:media] !=nil
# 0 = everything
# 1 = videos, no photos
# 2 = photos, no videos
# 3 = photos and videos
filter_media = [{:video_embed=>{'$exists'=>true}},{:image_url=>{'$exists'=>true}}]
if params[:media]==1
filter_media = [{:video_embed=>{'$exists'=>true}}]
else
filter_media = [{}]
end
end
@media = params[:media]
if page > 0
@prev = page -1
end
@next = page + 1
#exclude blocked users and apply the media filter inside :conditions
#(merging it at the options level, as before, filtered nothing)
@tweets = CrawledTweet.all({:conditions=>{:block=>{'$exists'=>false}}.merge(filter_media[0]), :limit=>25, :skip=>25*page, :order=>:timestamp.asc})
haml :tweets
end
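#illustrative usage: GET /crawl/tweets/0 returns the first 25 unblocked
#tweets (oldest first); GET /crawl/tweets/2/1 returns page 3, videos only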
#grab a youtube video through the api and return json of the same, for ajax within occupymap
get '/video/*' do
response['Access-Control-Allow-Origin'] = '*'
params[:url] = params[:splat].join("/")
if (params[:url].split("youtube.com").size > 1)
params[:url] << "?v=" + params[:v]
params[:url].gsub!("http:/","http://")
client = YouTubeIt::Client.new(:dev_key => @devkey)
vid = client.video_by(params[:url])
#build the json response
json = {:url =>params[:url], :title=>vid.title, :description=>vid.description, :username=>vid.author.name, :date =>vid.published_at}
Youtube.collection.update({:url=>params[:url]},json,{:upsert=>true})
content_type 'application/json'
@json = json.to_json
elsif (params[:url].split("vimeo.com").size > 1)
#parse out the vimeo id
video_id = params[:url].split("/").last
params[:url].gsub!("http:/","http://")
v = Vimeo::Simple::Video.info(video_id).parsed_response[0]
json = {:url =>params[:url], :title=>v["title"], :description=>v["description"],
:username=>v["user_url"], :date =>v["upload_date"]}
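#vimeo results are cached in the Youtube collection for now (see the
#migration note on the Youtube model above)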
Youtube.collection.update({:url=>params[:url]},json,{:upsert=>true})
content_type 'application/json'
@json = json.to_json
else
@json = "could not parse video"
end
haml :video
end
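#illustrative usage: GET /video/http://www.youtube.com/watch?v=VIDEO_ID
#(VIDEO_ID is a placeholder; the ?v= querystring arrives as params[:v])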
#get one tweet by its id string from the mongo cache, echo it as json
get '/tweets/json/:id' do
response['Access-Control-Allow-Origin'] = '*'
#the original looked up :id.to_s, the literal string "id", not the param
@json = Tweet.first(:id_str=>params[:id].to_s).to_json
#just echo it as a json string
content_type 'application/json'
haml :view
end
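#illustrative usage: GET /tweets/json/196982181401341952 echoes the
#cached tweet document with that id_str as json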
get '/users/block/:user_id' do
#add user to blocks table
#flag all existing tweets and crawled tweets to hide
BlockedUser.collection.update({:user_id=>params[:user_id]},{:user_id=>params[:user_id]},{:upsert=>true})
#match on from_user_id, the field the crawler's block check uses
#(the original :user.id called Symbol#id rather than naming a field, and
#tweet.attrs is a twitter-gem method, not a MongoMapper one)
@tweets = CrawledTweet.all({:conditions=>{:from_user_id=>params[:user_id]}})
@tweets.each do |tweet|
tweet["block"] = 1
tweet.save
end
end
get '/block' do
@tweet = {:text=>""}
haml :block
end
get '/block/:id_str' do
@tweet = CrawledTweet.first({:conditions=>{:id_str=>params[:id_str]}})
@tweet['block'] = 1
puts @tweet.inspect
@tweet.save
haml :block
end
#compile videos from crawled tweets
#('compile?' made the final "e" optional; '/?' was the intent)
get '/videos/compile/?' do
@tweets = CrawledTweet.all({:conditions=>{:video_embed=>{'$exists'=>true}}})
@tweets.each do |tweet|
#the original update here was left unfinished (tweet[]); this sketch
#upserts one Video document per crawled tweet, keyed on id_str
Video.collection.update({:id_str=>tweet["id_str"]}, {:id_str=>tweet["id_str"], :embed=>tweet["video_embed"]}, {:upsert=>true})
end
end
#get videos from the video collection - thumbnail, embed, who tweeted it
get '/videos/index/:page?' do
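#TODO unimplemented stub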
end