added imagescraper tool

this tool scrapes the cover image of each anime in our database (images sourced from MAL) and saves them locally.
author: daniel-Jones <daniel@danieljon.es> 2018-06-05 15:23:45 +0930
committer: daniel-Jones <daniel@danieljon.es> 2018-06-05 15:23:45 +0930
commit: 0a9220f18e0dd56314f9aa97b88dd03416c51f6c (patch)
tree: 2bb11e9ef0802429709fabfb3f60fa02ab80622d
parent: f888d0228fabc61ac026a17e82e82583f9a4390e (diff)
download: animedb-0a9220f18e0dd56314f9aa97b88dd03416c51f6c.tar.gz
animedb-0a9220f18e0dd56314f9aa97b88dd03416c51f6c.zip
1 files changed, 110 insertions, 0 deletions
diff --git a/imagescraper/imagescraper.py b/imagescraper/imagescraper.py
new file mode 100755
index 0000000..6c23a5d
--- /dev/null
+++ b/imagescraper/imagescraper.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+
+"""
+imagescraper.py is part of animedb.
+The purpose of this program is to scrape images from MAL for each anime inside our database.
+Copyright (C) 2018 Daniel Jones daniel@danieljon.es 
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+"""
+
+import argparse;
+import sqlite3;
+import os.path;
+import urllib.request;
+import re;
+
+def dbconnect(dbname):
+    """
+    connect to our database and return the object
+    """
+    try:
+        dbcon = sqlite3.connect(dbname);
+    except:
+        e = sys.exc_info()[0];
+        exit(e);
+    return dbcon; 
+
+def dbclose(db):
+    """
+    close database
+    """
+    db.close();
+
+def dbgetanimeids(db):
+    ids = [];
+    c = db.cursor();
+    for id in c.execute("SELECT animeid FROM anime;"):
+        ids.append(id[0]);
+    return (ids);
+
+def createlinks(ids):
+    links = [];
+    for anime in ids:
+        links.append("https://myanimelist.net/anime/{}".format(anime));
+    return links;
+
+def scrapelinks(animelinks):
+    """
+    scrape MAL link for the anime cover links
+    regex on html, what could go wrong?
+    """
+    links = [];
+    for link in animelinks:
+        src = urllib.request.urlopen(link).read().decode("utf-8");
+        for l in re.findall('https:\/\/myanimelist\.cdn-dena\.com\/images\/anime\/[0-9]*\/[0-9]*.jpg', src):
+            links.append(l);
+            break;
+    return links;
+
+def getcoverimage(link, animeid):
+    print("downloading {}".format(link));
+    urllib.request.urlretrieve(link, "covers/{}.jpg".format(animeid))
+
+if __name__ == "__main__":
+    """
+    retrieve anime id's from the database
+    construct urls to scrape
+    scrape url for cover image link
+    """
+
+    parser = argparse.ArgumentParser();
+    parser.add_argument("-d", "--database", type=str, action="store", dest="dbfile",
+            default="../userdb.db", required=True,
+            help="sqlite3 database file containing anime information");
+    args = parser.parse_args();
+
+    # if our database (file) doesn't exist, exit
+    if not os.path.isfile(args.dbfile):
+        exit("file doesn't exist {}".format(args.dbfile));
+
+    db = dbconnect(args.dbfile);
+
+    # collect anmie ids
+    animeids = dbgetanimeids(db);
+    # create MAL anime links
+    animelinks = createlinks(animeids);
+
+    # scrape links for the cover image link
+    print("scraping MAL pages... this may take some time...");
+    coverlinks = scrapelinks(animelinks);
+
+    # download cover images
+    x = 0;
+    for link in coverlinks:
+        getcoverimage(link, animeids[x]);
+        x += 1;
+
+    dbclose(db);
author	daniel-Jones <daniel@danieljon.es>	2018-06-05 15:23:45 +0930
committer	daniel-Jones <daniel@danieljon.es>	2018-06-05 15:23:45 +0930
commit	0a9220f18e0dd56314f9aa97b88dd03416c51f6c (patch)
tree	2bb11e9ef0802429709fabfb3f60fa02ab80622d
parent	f888d0228fabc61ac026a17e82e82583f9a4390e (diff)
download	animedb-0a9220f18e0dd56314f9aa97b88dd03416c51f6c.tar.gz animedb-0a9220f18e0dd56314f9aa97b88dd03416c51f6c.zip