#!/usr/bin/env python """ The Pirate Bay scraper - This is the client Uses 3 external libraries for scraping HTML elements from ThePirateBay and interacting with transmission-daemon. Asks user for a search selection, offers a list of choices, and grabs the magnet link for the selection in a menu style fashion. """ __author__ = 'LANCE - https://github.com/lalanza808' ################################################## # Libraries # Built-in libraries from os import path import argparse # 3rd party libraries import requests import bs4 import transmissionrpc ################################################## # Variables # Torrent server IP; can be any machine running transmission-daemon # with a firewall inbound allowed to TCP/9091 (transmissionrpc) rpcserver = [] # Dictionaries/Arrays for storing search results tpb_search_results = {} tpb_torrent_links = [] user_torrent_selection = "" # Current/working PirateBay URL tpb = "https://thepiratebay.org" # Squelch HTTPS insecure warnings requests.packages.urllib3.disable_warnings() ################################################## # Parsing and Arguments parser = argparse.ArgumentParser(description='Scrape The Pirate Bay for torrents.') parser.add_argument('--query', '-q', dest='arg_search_string', help='The query string to search for on TPB', required=False) parser.add_argument('--top', '-t', dest='arg_take_top', action='store_true', help='Automatically grab the torrent with most seeds', required=False) parser.add_argument('--file', '-f', dest='arg_magnet_link', help='Direct link to magnet link or torrent file', required=False) parser.add_argument('--url', '-u', dest='arg_torrent_page', help='URL of the torrent on TPB', required=False) parser.add_argument('--server', '-s', dest='arg_server_host', help='IP/hostname of the Transmission server', required=False) args = parser.parse_args() ################################################## # Functions #1 def Check_Transmission_Listener(): """ Checks to see if transmission-daemon is listening on rpcserver and initiates the function to ask user for input """ if not rpcserver and not args.arg_server_host: print "No Transmission server specified! Quitting!" exit(9) elif args.arg_server_host: rpcserver.append(args.arg_server_host) try: transmissionrpc.Client(rpcserver[0], port=9091) Get_Search_URL() except KeyboardInterrupt: print "\n\nBye." exit(1) except transmissionrpc.error.TransmissionError: print "[!] Transmission-daemon not listening on {}!".format(rpcserver[0]) exit(2) #2 def Get_Search_URL(): """ Takes input string to search for on TPB. Formats string into proper url Gets HTML source of search page for use in the next function """ #If magnet link supplied, directly add to queue, exit script if args.arg_magnet_link: transmissionrpc.Client(rpcserver[0]).add_torrent(args.arg_magnet_link) exit(0) #If URL supplied, skip to Download_Torrent_From_URL function elif args.arg_torrent_page: Download_Torrent_From_URL(args.arg_torrent_page) exit(0) #If search string provided, use it for Get_Torrent_Links function elif args.arg_search_string: tpb_search_string = args.arg_search_string #If nothing supplied, ask user for search string else: tpb_search_string = raw_input("[+] What would you like to search?\n>>> ") tpb_search_url = "{}/search/{}/0/7/0".format(tpb, tpb_search_string) #/0/7/0 tells TPB to sort descending by seeds tpb_torrent_page_source = requests.get(tpb_search_url, verify=False).text #Use requests lib to fetch page source for bs4 parsing Get_Torrent_Links(tpb_torrent_page_source) #Run Get_Torrent_Links function, passing page source for BS4 parsing #3 def Get_Torrent_Links(source): """ Takes the page source and parses it with BeautifulSoup. Finds all anchor elements on the page, pre-sorted by seeders Enumerates list of elements, and adds them to tpb_search_results dictionary """ print "\n" global tpb_torrent_links, tpb_search_results #Update the tpb_torrent_links array with the returned torrents tpb_torrent_page_soup = bs4.BeautifulSoup(source, "html.parser") #Create Beautiful Soup object for link in tpb_torrent_page_soup.find_all('a'): #Find all anchor elements in page source if link.get('href').startswith('/torrent'): #Only get links with /torrent as they're valid torrent pages tpb_torrent_links.append(link.get('href')) #Set the results to tpb_torrent_links array #If -t is supplied, bypass this section of code and go on to download the top torrent if args.arg_take_top and tpb_torrent_links: Download_Torrent_From_URL("{}/{}".format(tpb, tpb_torrent_links[0])) #Print links in numeric order to the user else: for number,link in enumerate(tpb_torrent_links): #Enumerate the array so the numbers start at 0 tpb_search_results.update({number:link}) #Append results to tpb_search_results dictionary print "({}) {}".format(number, path.basename(link)) if tpb_search_results: #If dict is not empty, continue with script print "\n(98) Search again" print "(99) Exit" Get_User_Selection() else: #If dict is empty (no results from search) re-run script print "\nNo results found. Try again." tpb_search_results = {} tpb_torrent_links = [] args.arg_search_string = '' Get_Search_URL() #Loop back to script start #4 def Get_User_Selection(): """ Asks for selection of torrent """ global tpb_torrent_links, tpb_search_results #Ask user for numeric selection try: selection = int(raw_input("\n[+] Enter the number of the torrent to download.\n>>> ")) #Zeroize variables, loop back to script start if selection == 98: print "\nStarting over" tpb_search_results = {} tpb_torrent_links = [] args.arg_search_string = '' Get_Search_URL() #Exit script elif selection == 99: print "\nBye.\n" exit() #If valid number, move to next function to add to queue elif selection in tpb_search_results: user_torrent_selection = tpb_search_results[selection] #Updates variable based on key provided above, matches it with tpb_search_results dict Download_Torrent_From_URL("{}/{}".format(tpb, user_torrent_selection)) #If anything other than 98, 99, or valid key number entered, loop back to selection input else: print "\nNot a valid number" Get_User_Selection() #If number isn't used, loop back to selection input except ValueError: print "\nThat is not a digit." Get_User_Selection() #5 def Download_Torrent_From_URL(tpb_torrent_url): """ Grabs the first magnet link and adds it to the queue via RPC to rpcserver """ tpb_magnet_links = [] tpb_torrent_page = requests.get(tpb_torrent_url, verify=False) tpb_torrent_page_soup = bs4.BeautifulSoup(tpb_torrent_page.content, "html.parser") for link in tpb_torrent_page_soup.find_all('a'): if str(link.get('href')).startswith('magnet:?xt'): tpb_magnet_links.append(link.get('href')) tpb_magnet_link = tpb_magnet_links[0] print "\n[+] Adding magnet link for torrent:\n\n{}".format(tpb_torrent_url) transmissionrpc.Client(rpcserver[0]).add_torrent(tpb_magnet_link) print "\n[.] Done!\n" exit(0) if __name__ == "__main__": Check_Transmission_Listener()