Commit cd3f2b98c563e1822af27c973935259afaf40b20

Authored by Noah
1 parent 6156c772bf
Exists in master

Collect all URLs, including those in the link, post body, and comments

Showing 4 changed files with 128 additions and 77 deletions

insight/crawlShortener.py View file @ cd3f2b9
1   -# -*- coding: utf-8 -*-
2   -
3   -import requests
4   -from base62 import Base62
5   -
6   -class CrawlShortener() :
7   -
8   - def runCrawler(self, links):
9   - data = []
10   -
11   - print links
12   -
13   - try:
14   - for link in links:
15   -
16   - if len(link) < 5:
17   - continue
18   -
19   - link_meta = link + "+"
20   - txt = requests.get(link_meta).text
21   -
22   - source_tag_op = "\"long_url\": \""
23   - source_tag_cl = "\""
24   -
25   - clicks_tag_op = "\"user_clicks\": "
26   - clicks_tag_cl = ","
27   -
28   - source_bgn = txt.find(source_tag_op) + len(source_tag_op)
29   - source_end = source_bgn + txt[source_bgn:(source_bgn + 500)].find(source_tag_cl)
30   - clicks_bgn = txt.find(clicks_tag_op) + len(clicks_tag_op)
31   - clicks_end = clicks_bgn + txt[clicks_bgn:(clicks_bgn + 50)].find(clicks_tag_cl)
32   -
33   - try:
34   - piki_url = str(txt[source_bgn:source_end]).split("cid=")[1].split("&")[0]
35   - except:
36   - piki_url = str(0)
37   -
38   - data.append({
39   - 'bit_url' : str(link),
40   - 'origin_url': str(txt[source_bgn:source_end]),
41   - 'bit_click': str(txt[clicks_bgn:clicks_end]),
42   - 'piki_cid' : str(Base62().decode(piki_url))
43   - })
44   -
45   - except:
46   - data.append({
47   - 'bit_url' : "",
48   - 'origin_url': "",
49   - 'bit_click': "0",
50   - 'piki_cid' : "0"
51   - })
52   -
53   - if len(data) == 0:
54   - data.append({'bit_url' : "",'origin_url': "",'bit_click': "0",'piki_cid' : "0"})
55   -
56   - return data
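The deleted CrawlShortener expanded each bit.ly link by fetching its "+" info page and slicing out the embedded "long_url" and "user_clicks" values with find(); that logic moves into Url.url2dic in the new insight/url.py below, with the bit_url/bit_click keys renamed to bitly_url/bitly_click. A minimal sketch of the same extraction done with a regex instead of manual slicing (not part of this commit, and assuming the "+" page still embeds those JSON-style fields):

import re
import requests

def expand_bitly(link):
    # Fetch the bit.ly "+" info page, as the deleted crawler did.
    txt = requests.get(link + "+").text
    # Pull out the same two fields with regexes instead of find()/slicing.
    long_url = re.search(r'"long_url":\s*"([^"]*)"', txt)
    clicks = re.search(r'"user_clicks":\s*(\d+)', txt)
    return {
        'origin_url': long_url.group(1) if long_url else "",
        'bitly_click': clicks.group(1) if clicks else "0",
    }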
insight/postinsight.py View file @ cd3f2b9
... ... @@ -3,13 +3,15 @@
3 3  
4 4 import requests
5 5 import json
6   -from util import Util
  6 +from url import Url
7 7  
8 8  
9 9 class PostInsight() :
10 10  
11 11 token = ""
12 12 type = ""
  13 + link = ""
  14 + comment = ""
13 15 data = {}
14 16 insight_map = {}
15 17  
16 18  
... ... @@ -19,12 +21,13 @@
19 21  
20 22 def setContentInsight(self,content_id):
21 23  
22   - url ='https://graph.facebook.com/v2.5/%s?fields=comments.limit(5),type,insights{values}'%content_id
  24 + url ='https://graph.facebook.com/v2.5/%s?fields=link,comments.limit(5),type,insights{values}'%content_id
23 25 txt = requests.get(url + self.token).text
24 26 #print url + self.token
25 27 self.data = json.loads(txt)
26 28 self.setContentType(self.data)
27 29 self.setContentComment(self.data)
  30 + self.setLinkUrl(self.data)
28 31 #print self.data
29 32 try:
30 33 for i in self.data['insights']['data']:
31 34  
... ... @@ -63,11 +66,20 @@
63 66  
64 67 def setContentComment(self, data):
65 68 try:
66   - self.comment = Util().getText2bitly(data['comments']['data'][0]['message'])
67   - print self.comment
  69 + self.comment = Url().getText2bitly(data['comments']['data'][0]['message'])
  70 + #print self.comment
68 71 except:
69 72 self.comment = ""
70 73  
  74 + def setLinkUrl(self, data):
  75 + try:
  76 + self.link = data['link']
  77 + #print self.link
  78 + except:
  79 + self.link = ""
  80 +
  81 + def getLinkUrl(self):
  82 + return self.link
71 83  
72 84 def getContentType(self):
73 85 return self.type
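The functional change here is the `link` field added to the Graph API field list, plus the setLinkUrl/getLinkUrl pair that exposes it; the comment URL extraction simply switches from Util to the new Url class. A minimal sketch of what the class now reads out of one Graph API response (not part of this commit; content_id and the access token string are placeholders, the real values come from the feed and from Token):

import json
import requests

content_id = "PAGEID_POSTID"    # placeholder post id
token = "&access_token=..."     # placeholder; built elsewhere by Token

url = 'https://graph.facebook.com/v2.5/%s?fields=link,comments.limit(5),type,insights{values}' % content_id
data = json.loads(requests.get(url + token).text)

link = data.get('link', "")     # what setLinkUrl() stores
first_comment = data['comments']['data'][0]['message'] if 'comments' in data else ""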
insight/url.py View file @ cd3f2b9
  1 +# -*- coding: utf-8 -*-
  2 +
  3 +import requests
  4 +from base62 import Base62
  5 +
  6 +class Url() :
  7 +
  8 + def url2dic(self, links):
  9 + data = []
  10 +
  11 + try:
  12 + for link in links:
  13 +
  14 +
  15 + if len(link) < 5:
  16 + continue
  17 +
  18 + if (self.isdeep(link) or self.isrpiki(link)) :
  19 +
  20 + data.append({
  21 + 'bitly_url' : "",
  22 + 'origin_url': str(link),
  23 + 'bitly_click': "0",
  24 + 'piki_cid' : str(self.getUrl2Cid(link))
  25 + })
  26 +
  27 + elif self.isbitly(link):
  28 + link_meta = link + "+"
  29 + txt = requests.get(link_meta).text
  30 +
  31 + source_tag_op = "\"long_url\": \""
  32 + source_tag_cl = "\""
  33 +
  34 + clicks_tag_op = "\"user_clicks\": "
  35 + clicks_tag_cl = ","
  36 +
  37 + source_bgn = txt.find(source_tag_op) + len(source_tag_op)
  38 + source_end = source_bgn + txt[source_bgn:(source_bgn + 500)].find(source_tag_cl)
  39 + clicks_bgn = txt.find(clicks_tag_op) + len(clicks_tag_op)
  40 + clicks_end = clicks_bgn + txt[clicks_bgn:(clicks_bgn + 50)].find(clicks_tag_cl)
  41 +
  42 + try:
  43 + piki_url = str(txt[source_bgn:source_end]).split("cid=")[1].split("&")[0]
  44 + except:
  45 + piki_url = str(0)
  46 +
  47 + data.append({
  48 + 'bitly_url' : str(link),
  49 + 'origin_url': str(txt[source_bgn:source_end]),
  50 + 'bitly_click': str(txt[clicks_bgn:clicks_end]),
  51 + 'piki_cid' : str(Base62().decode(piki_url))
  52 + })
  53 +
  54 + except:
  55 + data.append({
  56 + 'bitly_url' : "",
  57 + 'origin_url': "",
  58 + 'bitly_click': "0",
  59 + 'piki_cid' : "0"
  60 + })
  61 +
  62 + if len(data) == 0:
  63 + data.append({'bitly_url' : "",'origin_url': "",'bitly_click': "0",'piki_cid' : "0"})
  64 +
  65 + #print data
  66 + return data
  67 +
  68 +
  69 +
  70 + def getUrl2Cid(self,url):
  71 +
  72 + if self.isrpiki(url):
  73 + return Base62().decode(url.split("cid=")[1].split("&")[0])
  74 + elif self.isdeep(url):
  75 + try:
  76 + return requests.get(url).text.split("http://www.pikicast.com/share/")[1].split('"')[0]
  77 + except :
  78 + return "0"
  79 +
  80 +
  81 + def getText2bitly(self,text):
  82 + return "http://bit.ly/" + text.split("http://bit.ly/")[1].split(" ")[0]
  83 +
  84 +
  85 + def isdeep(self, url):
  86 +
  87 + if url.find("//fb.me/") > 0 :
  88 + return True
  89 + else:
  90 + return False
  91 +
  92 + def isrpiki(self, url):
  93 +
  94 + if url.find("//r.pikicast.com/") > 0 :
  95 + return True
  96 + else:
  97 + return False
  98 +
  99 + def isbitly(self, url):
  100 +
  101 + if url.find("//bit.ly/") > 0 :
  102 + return True
  103 + else:
  104 + return False
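The new Url class first classifies each link (fb.me deep links, r.pikicast.com redirect links, bit.ly short links) and only fetches the bit.ly "+" info page for the last kind; deep and r.piki links are resolved to a content id directly via getUrl2Cid. A usage sketch against the class as added here (the example URLs are made up; the codebase targets Python 2):

from insight.url import Url

# Illustrative links only -- one of each kind url2dic() recognises.
links = [
    "http://fb.me/abc123",                    # deep link: share page fetched, cid extracted
    "http://r.pikicast.com/a?cid=AbCd&f=fb",  # r.piki link: cid decoded with Base62
    "http://bit.ly/xyz789",                   # bit.ly link: "+" info page parsed
]

for row in Url().url2dic(links):
    print row['origin_url'], row['bitly_click'], row['piki_cid']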
... ... @@ -5,16 +5,13 @@
5 5 from insight.token import Token
6 6 from insight.postinsight import PostInsight
7 7 from insight.datadb import DataDB
8   -from insight.util import Util
9   -from insight.crawlShortener import CrawlShortener
  8 +
  9 +from insight.url import Url
10 10 import time
11 11  
12 12  
13 13 if __name__=='__main__':
14 14  
15   - util = Util()
16   -
17   - #token_str = "CAAUTLd5JgaoBABnBUUgOjgaUkuviOZCC1otVIJYwYapCRTZBU8ZAbTOd1uo1SBiqHr1eBhVdzoYDoBjPzqO0ImpxGZAXtlXeVtp4r2muUDt3O53NLYqFgrJqKAowcU7fWMsmMTZAAtKT8MXeF4Lb0rHQ2I1gHmbr6nlJWBHzk6dsNPmYHguMp"
18 15 token_str = "EAAUTLd5JgaoBAJMeeMXqcdExQ1egUHeBaIgVBCilmiH4K9RNyUt7gSgVZCZAtszWCLEaZCDQpxewhICtFjNRICFPWAqUygshcSsdEZBUeZAyUJkON7bfQ2NFFI5AqifNahzjFT83GkWZCZCZBXO3050XSjFf9HSR0iAZD"
19 16 token = Token(token_str)
20 17  
21 18  
... ... @@ -24,9 +21,7 @@
24 21  
25 22 for content in contents_list:
26 23 datadb = DataDB()
27   - #print content
28 24 time.sleep(1)
29   - #print content['id'].split('_')
30 25 p_id = str(content['id'].split('_')[0])
31 26 c_id = str(content['id'].split('_')[1])
32 27 created_time = content['created_time'].replace('T'," ").replace('+0000',"")
33 28  
... ... @@ -37,11 +32,10 @@
37 32 message = ""
38 33  
39 34 try:
40   - message_url = util.getText2bitly(content['message'])
  35 + message_url = Url().getText2bitly(content['message'])
41 36 except:
42 37 message_url = ""
43 38  
44   -
45 39 insight = PostInsight(token)
46 40 insight.setContentInsight(content['id'])
47 41  
48 42  
49 43  
50 44  
... ... @@ -114,18 +108,15 @@
114 108 sqlvalues += "'" + created_time + "', "
115 109 sqlvalues += "NOW(), "
116 110  
117   - bitdata = CrawlShortener().runCrawler([message_url,insight.getContentCommentUrl()])[0]
118   - #print bitdata
  111 + url_data = Url().url2dic([insight.getLinkUrl(),message_url,insight.getContentCommentUrl()])[0]
119 112  
120   - sqlvalues += "'" + bitdata['bit_url'] + "', "
121   - sqlvalues += bitdata['bit_click'] + ", "
122   - sqlvalues += "'" + bitdata['origin_url'] + "', "
123   - sqlvalues += bitdata['piki_cid'] + ", "
  113 + sqlvalues += "'" + url_data['bitly_url'] + "', "
  114 + sqlvalues += url_data['bitly_click'] + ", "
  115 + sqlvalues += "'" + url_data['origin_url'] + "', "
  116 + sqlvalues += url_data['piki_cid'] + ", "
124 117  
125 118 for list in lists:
126   - #print list, insight.getContentInsightByKey(list)
127 119 sqlprefix += "`" + list.replace(',','_') + "`, "
128   - #print list.replace(',','_')
129 120 sqlvalues += str(insight.getContentInsightByKey(list)) + ", "
130 121  
131 122 sqlprefix = sqlprefix[:len(sqlprefix)-2] + ")"
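Since url2dic() skips links shorter than five characters and only appends rows for links it recognises, taking element [0] of Url().url2dic([insight.getLinkUrl(), message_url, insight.getContentCommentUrl()]) picks the first recognised URL in priority order: the post's link field, then the bit.ly URL in the message, then the one in the first comment. The four values are still spliced into the SQL string by hand; a hedged alternative (assuming DataDB wraps a DB-API cursor, which this diff does not show, and with an illustrative table/column layout) would bind them as parameters:

def insert_url_columns(cursor, url_data):
    # `cursor` is assumed to be a DB-API cursor from DataDB's connection
    # (not shown in this diff); the table and column names are illustrative.
    cursor.execute(
        "INSERT INTO content_insight (bitly_url, bitly_click, origin_url, piki_cid) "
        "VALUES (%s, %s, %s, %s)",
        (url_data['bitly_url'], url_data['bitly_click'],
         url_data['origin_url'], url_data['piki_cid'])
    )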