"""Group app-store listings whose icons look alike.

Each URL in ``url_list`` is fetched, the application name and icon are scraped
from the store page, and the icons are compared pairwise: first by perceptual
hash (``imagehash.dhash``), then by the shapes of their Canny-edge contours.
Listings judged similar are printed as groups; the rest are printed as
ungrouped.
"""
from collections import namedtuple

import numpy as np
import cv2
import requests
from bs4 import BeautifulSoup
import imagehash
from PIL import Image


def _fetch_icon(imageurl):
    """Download ``imageurl`` and decode it with ``cv2.imdecode``.

    Returns the decoded image (``cv2.IMREAD_COLOR``), or ``None`` when the
    server does not answer with HTTP 200.
    """
    icon_r = requests.get(imageurl)
    if icon_r.status_code != 200:
        return None
    img_array = np.asarray(bytearray(icon_r.content), dtype=np.uint8)
    return cv2.imdecode(img_array, cv2.IMREAD_COLOR)


def itunes_find(content):
    """Scrape ``(name, icon)`` from the HTML of an iTunes app page.

    The name is the text of the ``<h1>`` under the element with id ``title``;
    the icon URL is the ``src-swap-high-dpi`` attribute of the ``artwork``
    image whose ``alt`` equals that name.
    """
    soup = BeautifulSoup(content)
    found = soup.find(id="title")
    name = found.div.h1.get_text()
    found = soup.find('img', {'class': 'artwork', 'alt': name})
    icon = _fetch_icon(found['src-swap-high-dpi'])
    return name, icon


def google_find(content):
    """Scrape ``(name, icon)`` from the HTML of a Google Play app page.

    The icon is the first ``<img>`` inside the ``cover-container`` div.  The
    name is the text of the ``document-title`` element (a ``div``, or an
    ``h1`` as a fallback).  When no title element exists the page is dumped
    to ``olala1.html`` for inspection and the name is returned as ``None``.
    """
    soup = BeautifulSoup(content)
    found = soup.find('div', {'class': 'cover-container'})
    icon = _fetch_icon(found('img')[0]['src'])

    found = soup.find('div', {'class': 'document-title'})
    if not found:
        found = soup.find('h1', {'class': 'document-title'})
    if not found:
        # Keep the debugging dump, but do not dereference the missing
        # element afterwards; the caller skips entries without a name.
        # Binary mode because the caller passes the raw response bytes.
        with open('olala1.html', 'wb') as f:
            f.write(content)
        return None, icon
    return found.get_text(), icon


def windows_find(content):
    """Scrape ``(name, icon)`` from the HTML of a Windows Phone store page.

    The icon URL is the ``src`` of the ``appImage xlarge`` image; the name is
    the text of the first ``<h1>`` under the element with id ``application``.
    """
    soup = BeautifulSoup(content)
    found = soup.find('img', {'class': 'appImage xlarge'})
    icon = _fetch_icon(found['src'])
    found = soup.find(id="application")
    name = found('h1')[0].get_text()
    return name, icon


class Entry:
    """One scraped store listing plus the features computed from its icon."""

    def __init__(self, url, name, icon):
        self.url = url
        self.name = name
        self.icon = icon
        # Filled in by _do() once the icon has been analysed.
        self.icon_hash = None
        self.contours = None
        self.weight = None


# Successfully scraped listings, keyed by URL.
items = {}

# URL prefix -> scraper for that store.
_FINDERS = (
    ('https://itunes.apple.com', itunes_find),
    ('https://play.google.com', google_find),
    ('http://www.windowsphone.com', windows_find),
)


def _go(url):
    """Fetch ``url``, scrape it with the matching finder and record the result.

    The entry is stored in ``items`` only when both a name and an icon were
    obtained.  URLs that belong to none of the known stores are skipped.
    """
    for prefix, finder in _FINDERS:
        if url.startswith(prefix):
            break
    else:
        # Unsupported store: nothing can be scraped, so do not even fetch.
        return

    # NOTE(review): verify=False disables TLS certificate verification;
    # left unchanged here, but confirm that it is really required.
    r = requests.get(url, headers={'User-agent': 'Mozilla/5.0'}, verify=False)
    if r.status_code != 200:
        return
    name, icon = finder(r.content)
    if name and icon is not None:
        items[url] = Entry(url, name, icon)


url_list = [
    'https://itunes.apple.com/en/app/skype-for-iphone/id304878510?mt=8',
    'https://itunes.apple.com/en/app/skype-for-ipad/id442012681?mt=8',
    'https://play.google.com/store/apps/details?id=com.skype.raider&hl=en',
    'http://www.windowsphone.com/ru-ru/store/app/skype/c3f8e570-68b3-4d6a-bdbb-c0a3f4360a51',
    'https://play.google.com/store/apps/details?id=com.skype.android.access&hl=en',
    'https://itunes.apple.com/en/app/skype-wifi/id444529922?mt=8',
    'https://play.google.com/store/apps/details?id=com.skype.android.qik&hl=en',
    'https://itunes.apple.com/us/app/skype-qik-group-video-messaging/id893994044?mt=8',
    'https://play.google.com/store/apps/details?id=com.viber.voip&hl=en',
    'https://itunes.apple.com/en/app/viber/id382617920?mt=8',
    'https://play.google.com/store/apps/details?id=com.viber.voip&hl=en',
    'https://play.google.com/store/apps/details?id=com.ketchapp.skyward&hl=en',
    'https://itunes.apple.com/us/app/skyward/id943273841?mt=8',
    'https://play.google.com/store/apps/details?id=cz.george.mecheche&hl=en',
]

# Lower Canny threshold; the upper threshold is twice this value.
tr = 100

# A contour pair counts as matching when cv2.matchShapes is below this.
_SHAPE_MATCH_MAX = 0.15
# Two icons are grouped when their accumulated contour rating exceeds this.
_GROUP_RATING_MIN = 0.8


def _contour_origin(contour):
    """Sort key: the (x, y) corner of the contour's bounding rectangle."""
    x, y, _w, _h = cv2.boundingRect(contour)
    return (x, y)


def _analyse(item):
    """Compute ``icon_hash``, ``contours`` and ``weight`` for one entry."""
    icon_rgb = cv2.cvtColor(item.icon, cv2.COLOR_BGR2RGB)
    item.icon_hash = imagehash.dhash(Image.fromarray(icon_rgb))

    edges = cv2.Canny(item.icon, tr, tr * 2)
    # findContours returns (contours, hierarchy) or (image, contours,
    # hierarchy) depending on the OpenCV version; the contours are always
    # second from the end.  The literal 1 is the contour approximation mode
    # passed by the original code.
    contours = cv2.findContours(edges, cv2.RETR_LIST, 1)[-2]
    item.contours = sorted(contours, key=_contour_origin)
    # Total contour length, used to normalise individual contour lengths.
    item.weight = sum(cv2.arcLength(cnt, True) for cnt in item.contours)


def _similar(item, other):
    """Return True when two analysed entries appear to share an icon.

    Equal perceptual hashes match outright.  Otherwise contours are paired
    by position in the sorted lists; each pair whose shapes match adds the
    smaller of the two normalised contour lengths to a rating, and the icons
    match when the rating exceeds ``_GROUP_RATING_MIN``.
    """
    if item.icon_hash == other.icon_hash:
        return True
    if not item.weight or not other.weight:
        # No measurable contours on one side: nothing to compare, and the
        # normalisation below would divide by zero.
        return False

    rating = 0
    for mine, theirs in zip(item.contours, other.contours):
        # The literal 1 is the comparison method passed by the original code.
        if cv2.matchShapes(mine, theirs, 1, 0.0) < _SHAPE_MATCH_MAX:
            rating += min(cv2.arcLength(mine, True) / item.weight,
                          cv2.arcLength(theirs, True) / other.weight)
    return rating > _GROUP_RATING_MIN


def _group_entries(entries):
    """Partition entries into groups of similar icons.

    Returns ``(matches, ungrouped)``: ``matches`` is a list of URL lists and
    ``ungrouped`` is a list of URLs that matched nothing.  Each round takes
    the first remaining entry as the reference and pulls every entry similar
    to it out of the pool.
    """
    matches = []
    ungrouped = []
    pending = list(entries)
    while pending:
        item, rest = pending[0], pending[1:]
        pending = []
        group = []
        for other in rest:
            if _similar(item, other):
                group.append(other.url)
            else:
                pending.append(other)
        if group:
            group.append(item.url)
            matches.append(group)
        else:
            ungrouped.append(item.url)
    return matches, ungrouped


def _report(matches, ungrouped):
    """Print the groups and the ungrouped listings."""
    for group in matches:
        # Sorted so that the output does not depend on set ordering.
        names = sorted(set(items[u].name.strip() for u in group))
        print('Found group: %s' % ', '.join(names))
        print('Urls:\n%s\n' % '\n'.join(group))
    print("Ungrouped:")
    for url in ungrouped:
        print('Name %s' % items[url].name)
        print('Url %s' % url)


def _do():
    """Scrape every URL in ``url_list``, group the listings and print them."""
    for u in url_list:
        _go(u)
    for item in items.values():
        _analyse(item)
    matches, ungrouped = _group_entries(items.values())
    _report(matches, ungrouped)


# NOTE(review): runs on import as in the original script; consider guarding
# with ``if __name__ == "__main__":`` if this module is ever imported.
_do()