Proxy Blacklist Generator

Share on:

Proxy blacklist generator from Alexa and Open PageRank

  • This script generates proxy blacklists from categories and keywords; the example list below can easily generate hundreds of thousands of domains. False positives can be dealt with by adding ignore keywords after the second hyphen. For example, "gram" triggers a false positive on "programming", and in the pirate list "music" matches many pages that are not pirate sites, e.g. bandname.bandcamp.com. A significantly large free proxy blacklist is available on the homepage, created using this tool with a different category-keywords file.

buildlist

#!/bin/bash
#
# buildlist - build one "<category>.list" domain blacklist per line of a
# category-keywords file.
#
# Usage: buildlist [KEYWORDS_FILE]        (default: category-keywords)
#
# Each line holds up to three hyphen-separated fields:
#
#   category - keyword [keyword ...] [- false-positive ...]
#
# For every keyword the getlist command is run as
# `getlist <category> <keyword>` and is expected to append matching domains
# to "<category>.list". Lines matching any false-positive pattern are then
# removed and the list is sorted and de-duplicated in place.
#
# Because '-' is the field separator, keywords cannot contain a hyphen.
# Whitespace inside the category field is dropped to form the list name.
# Lines without a category name (including blank lines) are skipped.
#
# Environment:
#   GETLIST  command run for each keyword (default: ./getlist.sh)
#
# No `set -e` on purpose: a failing category is reported on stderr and the
# remaining categories are still processed; the exit status is 1 if anything
# failed and 0 otherwise.

#######################################
# Delete every line of a file that matches a basic regular expression.
# Arguments: $1 - pattern, $2 - file to filter in place
# Returns:   0 on success, non-zero if the file could not be rewritten
#######################################
remove_matching() {
  local pattern=$1 list=$2 tmp status
  tmp=$(mktemp "${list}.XXXXXX") || return 1
  grep -v -e "$pattern" -- "$list" > "$tmp"
  status=$?
  # grep exits 1 when every line was filtered out; only >1 is a real error.
  if (( status > 1 )); then
    rm -f -- "$tmp"
    return 1
  fi
  # Copy back instead of mv so the list keeps its original permissions.
  cat -- "$tmp" > "$list"
  status=$?
  rm -f -- "$tmp"
  return "$status"
}

#######################################
# Build the list for one line of the keywords file.
# Globals:   GETLIST (read)
# Arguments: $1 - raw line from the keywords file
# Outputs:   progress messages on stdout, problems on stderr
# Returns:   0 on success or skipped line, 1 if any step failed
#######################################
process_category() {
  local line=$1
  local name_field terms_field fp_field _rest
  local name list term pattern
  local getlist=${GETLIST:-./getlist.sh}
  local status=0
  local -a terms=() false_positives=()

  # read -r keeps backslashes, so regex keywords such as '\.ru' reach grep
  # intact. The fourth variable soaks up anything after a third hyphen.
  IFS='-' read -r name_field terms_field fp_field _rest <<< "$line"
  name=${name_field//[[:space:]]/}
  if [[ -z "$name" ]]; then
    return 0
  fi
  list="${name}.list"

  # Splitting with read -a avoids the glob expansion an unquoted echo does.
  read -r -a terms <<< "$terms_field"
  read -r -a false_positives <<< "$fp_field"

  # Without a keyword getlist would be called with no pattern at all.
  if (( ${#terms[@]} == 0 )); then
    printf 'buildlist: no keywords for category %s, skipping\n' "$name" >&2
    return 1
  fi

  printf 'Getting %s\n' "$name"
  for term in "${terms[@]}"; do
    printf -- '-    Getting %s\n' "$term"
    # stdin is detached so the command cannot swallow the keywords file
    # that the caller's read loop is consuming.
    if ! "$getlist" "$name" "$term" < /dev/null; then
      printf 'buildlist: %s failed for %s / %s\n' \
        "$getlist" "$name" "$term" >&2
      status=1
    fi
  done

  if [[ ! -f "$list" ]]; then
    printf 'buildlist: %s was not created\n' "$list" >&2
    return 1
  fi

  for pattern in "${false_positives[@]}"; do
    printf -- '-    Removing false positives...\n'
    remove_matching "$pattern" "$list" || status=1
  done

  # sort and clean up
  printf 'Sorting %s domains\n' "$name"
  sort -u -o "$list" -- "$list" || status=1
  return "$status"
}

#######################################
# Process every line of the keywords file.
# Arguments: $1 - keywords file (optional, default: category-keywords)
# Returns:   0 if every category succeeded, 1 otherwise
#######################################
main() {
  local keywords_file=${1:-category-keywords}
  local line status=0

  if [[ ! -r "$keywords_file" ]]; then
    printf 'buildlist: cannot read keywords file: %s\n' "$keywords_file" >&2
    return 1
  fi

  # The `|| [[ -n ... ]]` clause keeps a final line with no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    process_category "$line" || status=1
  done < "$keywords_file"

  return "$status"
}

main "$@"

getlist

#!/bin/sh
#
# getlist - append every known domain matching a search pattern to a
# category list.
#
# Usage: getlist <category> <pattern>
#   $1 = category -> matches are appended to "<category>.list"
#   $2 = pattern  -> grep basic regular expression matched against domains
#
# Domains are taken from the second comma-separated column of top-1m.csv
# and top10mil.csv in the current directory. A file that is missing or
# empty is downloaded and unpacked first.
#
# Exit status: 0 on success, 1 if a source list could not be fetched,
# 2 on a usage error.

# fetch_csv URL ARCHIVE EXTRACTED TARGET
# Make sure TARGET exists and is non-empty, downloading ARCHIVE from URL and
# unpacking EXTRACTED from it when needed.
fetch_csv() {
  if [ -s "$4" ]; then
    return 0
  fi
  if ! wget -O "$2" "$1"; then
    rm -f -- "$2"
    printf 'getlist: download failed: %s\n' "$1" >&2
    return 1
  fi
  # -o overwrites without prompting, so a stale empty file cannot make
  # unzip block waiting for an answer on stdin.
  if ! unzip -o "$2"; then
    rm -f -- "$2"
    printf 'getlist: cannot unpack %s\n' "$2" >&2
    return 1
  fi
  rm -f -- "$2"
  if [ "$3" != "$4" ]; then
    mv -- "$3" "$4" || return 1
  fi
  [ -s "$4" ]
}

# matching_domains FILE PATTERN
# Print, sorted, the domains in column 2 of FILE that match PATTERN.
matching_domains() {
  # Quotes are stripped before matching so anchored patterns such as
  # '\.ru$' behave the same on quoted and unquoted sources. Filtering
  # before sorting avoids sorting millions of lines for every keyword.
  awk -F ',' '{ domain = $2; gsub(/"/, "", domain); print domain }' "$1" |
    grep -e "$2" |
    sort
}

main() {
  # An empty pattern would match, and append, every known domain.
  if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
    printf 'Usage: %s <category> <pattern>\n' "${0##*/}" >&2
    return 2
  fi

  fetch_csv 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip' \
    top-1m.csv.zip top-1m.csv top-1m.csv || return 1
  fetch_csv 'https://www.domcop.com/files/top/top10milliondomains.csv.zip' \
    top10milliondomains.csv.zip top10milliondomains.csv top10mil.csv ||
    return 1

  {
    matching_domains top-1m.csv "$2"
    matching_domains top10mil.csv "$2"
  } >> "${1}.list"
}

main "$@"

example category-keywords file

veganism - animalcruelty vegan abolitionism animalrights speciesism
hacking - hacking hack crack hax exploit - shack
religion - religion christ jesus church bible sutra veda quran hadith tanach mishnah talmud midrash torah sahihalbukhari alfath
mail - mail
dating - dating date
social - facebook twitter linkedin instagram gram - programming
gaming - twitch counterstrike fortnite warcraft doda minecraft pubg overwatch monsterhunter witcher zelda destiny darksouls assassinscreed callofduty godofwar thewitcher elderscrolls
sport - football soccer basketball tennis golf hockey olympics
pirate - music torrent downloader share youtubedownloader mp3search - bandcamp
gov - gov government
russia - \.ru \.su 
china - \.cn
hongkong - \.hk  
taiwan - \.tw
macau - \.mo