From 0a9220f18e0dd56314f9aa97b88dd03416c51f6c Mon Sep 17 00:00:00 2001 From: daniel-Jones Date: Tue, 5 Jun 2018 15:23:45 +0930 Subject: added imagscraper tool this tool is used to scrape each anime cover from our database (image sourced from MAL) and save them locally. --- imagescraper/imagescraper.py | 110 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100755 imagescraper/imagescraper.py diff --git a/imagescraper/imagescraper.py b/imagescraper/imagescraper.py new file mode 100755 index 0000000..6c23a5d --- /dev/null +++ b/imagescraper/imagescraper.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 + +""" +imagescraper.py is part of animedb. +The purpose of this program is to scrape images from MAL for each anime inside our database. +Copyright (C) 2018 Daniel Jones daniel@danieljon.es + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +""" + +import argparse; +import sqlite3; +import os.path; +import urllib.request; +import re; + +def dbconnect(dbname): + """ + connect to our database and return the object + """ + try: + dbcon = sqlite3.connect(dbname); + except: + e = sys.exc_info()[0]; + exit(e); + return dbcon; + +def dbclose(db): + """ + close database + """ + db.close(); + +def dbgetanimeids(db): + ids = []; + c = db.cursor(); + for id in c.execute("SELECT animeid FROM anime;"): + ids.append(id[0]); + return (ids); + +def createlinks(ids): + links = []; + for anime in ids: + links.append("https://myanimelist.net/anime/{}".format(anime)); + return links; + +def scrapelinks(animelinks): + """ + scrape MAL link for the anime cover links + regex on html, what could go wrong? + """ + links = []; + for link in animelinks: + src = urllib.request.urlopen(link).read().decode("utf-8"); + for l in re.findall('https:\/\/myanimelist\.cdn-dena\.com\/images\/anime\/[0-9]*\/[0-9]*.jpg', src): + links.append(l); + break; + return links; + +def getcoverimage(link, animeid): + print("downloading {}".format(link)); + urllib.request.urlretrieve(link, "covers/{}.jpg".format(animeid)) + +if __name__ == "__main__": + """ + retrieve anime id's from the database + construct urls to scrape + scrape url for cover image link + """ + + parser = argparse.ArgumentParser(); + parser.add_argument("-d", "--database", type=str, action="store", dest="dbfile", + default="../userdb.db", required=True, + help="sqlite3 database file containing anime information"); + args = parser.parse_args(); + + # if our database (file) doesn't exist, exit + if not os.path.isfile(args.dbfile): + exit("file doesn't exist {}".format(args.dbfile)); + + db = dbconnect(args.dbfile); + + # collect anmie ids + animeids = dbgetanimeids(db); + # create MAL anime links + animelinks = createlinks(animeids); + + # scrape links for the cover image link + print("scraping MAL pages... this may take some time..."); + coverlinks = scrapelinks(animelinks); + + # download cover images + x = 0; + for link in coverlinks: + getcoverimage(link, animeids[x]); + x += 1; + + dbclose(db); -- cgit v1.2.3