# File: cleansite.py
"""
=======================================================================
use python's html and url parser libs to try to isolate and move
unused files in a web site directory; run me in the directory of
the site's root html file(s) (default=[index.html]);
this is heuristic: it assumes that referenced files are in this site
if they exist here; it also may incorrectly classify some files as
unused if they are referenced only from files which cause Python's
html parser to fail -- you should inspect the run log and unused file
directory manually after a run, to see if parse failures occurred;
more lenient html parsers exist for Python, but all seem 2.X-only;
other parse options might avoid failures too: re.findall() pattern
matches for '(?s)href="(.*?)"' and 'src=...'? (see Example 19-9);
see chapters 19 and 14 for html parsers, chapter 13 for url parsing;
to do: extend me to delete the unused files from remote site via ftp:
not done because unused files require verification if parse failures;
caveat: assumes site is one dir, doesn't handle subdirs (improve me);
=======================================================================
"""
import os, sys, html.parser, urllib.parse
def findUnusedFiles(rootfiles=('index.html',), dirunused='Unused', skipfiles=()):
    """
    find every file referenced by rootfiles and by any html page they
    reach, then move all the other files in the current directory to
    dirunused;

    rootfiles: names of the site's root html file(s), in the cwd;
    dirunused: directory that receives the unreferenced files;
    skipfiles: names to ignore when met as link targets: these are
               neither parsed nor recorded as used;
    returns the set of used filenames, which always includes rootfiles;

    defaults are immutable tuples so no mutable object is shared between
    calls; a single name passed as a bare string is accepted as well;
    """
    # a bare string would otherwise be split into characters by set(),
    # and turn the skip test into a substring match
    if isinstance(rootfiles, str):
        rootfiles = [rootfiles]
    if isinstance(skipfiles, str):
        skipfiles = [skipfiles]
    rootfiles = list(rootfiles)              # iterated twice below

    usedFiles = set(rootfiles)
    for rootfile in rootfiles:
        parseFileRefs(rootfile, usedFiles, skipfiles, 0)   # grows usedFiles in-place
    moveUnusedFiles(usedFiles, dirunused)
    return usedFiles
def moveUnusedFiles(usedFiles, dirunused, trace=print):
    """
    move each regular file in the current directory whose name is not
    in usedFiles into dirunused, creating that directory if needed;

    usedFiles: collection of filenames to leave in place;
    dirunused: destination directory (may be a nested path);
    trace:     callable invoked as trace('Moving...', filename) per move;

    entries that are not regular files (e.g. subdirectories) are only
    reported, never moved; raises OSError if dirunused cannot be created
    (including when it already exists as a plain file) or a move fails;
    """
    print('-' * 80)
    # makedirs + exist_ok handles nested paths and a pre-existing directory
    # in one step, and fails up front if the name is taken by a plain file
    # tbd: clean if present?
    os.makedirs(dirunused, exist_ok=True)
    for filename in sorted(os.listdir('.')):     # sorted: reproducible run log
        if filename in usedFiles:
            continue
        if not os.path.isfile(filename):
            print('Not a file:', filename)
        else:
            trace('Moving...', filename)
            os.rename(filename, os.path.join(dirunused, filename))
def parseFileRefs(htmlfile, usedFiles, skipFiles, indent, trace=print):
    """
    find files referenced in htmlfile, recurring into referenced html
    files (the recursion happens in MyParser.notefile);

    htmlfile:  name of the html file to scan;
    usedFiles: set of used filenames, extended in-place by the parser;
    skipFiles: names the parser must ignore;
    indent:    nesting depth, shown as leading dots in the trace line;
    trace:     callable used for the 'Parsing:' progress message;
    """
    trace('%sParsing:' % ('.' * indent), htmlfile)
    parser = MyParser(usedFiles, skipFiles, indent)

    # read and close the file before parsing: parsing may recur into
    # further files, and the handle should not stay open meanwhile
    with open(htmlfile) as page:
        text = page.read()

    # HTMLParseError was removed from html.parser in Python 3.5; naming
    # it directly in the except clause would raise AttributeError while
    # another exception is being handled; an empty tuple catches nothing
    parseErrors = getattr(html.parser, 'HTMLParseError', ())
    try:
        parser.feed(text)
    except parseErrors as E:
        print('==>FAILED:', E)           # file's refs may be missed!
    parser.close()
class MyParser(html.parser.HTMLParser):
    """
    use Python stdlib html parser to scan files; could nest this in
    parseFileRefs for enclosing scope, but would remake class per call;

    usedFiles is a set shared by all parser instances of one run and is
    extended in-place; skipFiles holds names to ignore; indent is the
    current nesting depth, passed on (plus 3) to nested parses;
    """
    # tag -> attribute carrying the referenced url; a subclass may
    # extend this table (e.g. add 'link': 'href') to follow more tags
    linkAttrs = {'a': 'href', 'img': 'src'}

    # filename suffixes (lowercase) that trigger a nested parse
    htmlExts = ('.html', '.htm')

    def __init__(self, usedFiles, skipFiles, indent):
        self.usedFiles = usedFiles
        self.skipFiles = skipFiles
        self.indent = indent
        super().__init__()         # vs html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """
        callback on tag open during parse: check links and images
        (the tags listed in linkAttrs); only the first matching
        attribute of a tag is used
        """
        wanted = self.linkAttrs.get(tag)
        if wanted is None:
            return
        for (name, value) in attrs:
            if name.lower() == wanted:
                if value:                    # skip valueless/empty attrs
                    self.notefile(value)
                break

    def notefile(self, url):
        """
        note used file found, and recur to a nested parse if html;
        only the last path component of the url is considered, and it
        counts as used only if something by that name exists in the cwd
        """
        filepath = urllib.parse.urlparse(url).path
        filename = os.path.basename(filepath)
        if not os.path.exists(filename):
            # links are often percent-encoded ('my%20pic.png'); retry
            # with the decoded name before giving up on the reference
            filename = os.path.basename(urllib.parse.unquote(filepath))
        if (os.path.exists(filename) and             # is it here?
                filename not in self.skipFiles and   # ignore it?
                filename not in self.usedFiles):     # skip repeats?
            self.usedFiles.add(filename)             # add in-place
            # case-insensitive, so 'PAGE.HTML' is followed too
            if filename.lower().endswith(self.htmlExts):
                parseFileRefs(
                    filename, self.usedFiles, self.skipFiles, self.indent + 3)
def deleteUnusedRemote(localUnusedDir, ftpsite, ftpuser, ftppswd, ftpdir='.'):
    """
    to do: delete unused files from remote site too? see Chapter 13 for ftp;
    not used because unused dir requires manual inspection if parse failures;

    logs in to ftpsite as ftpuser/ftppswd, changes to ftpdir, and deletes
    there every name found in the local directory localUnusedDir;
    errors raised by ftplib or os.listdir propagate to the caller, and
    the connection is closed whether or not the deletes succeed;
    """
    from ftplib import FTP
    # context manager: the session is ended and the socket released even
    # if login, cwd or one of the deletes raises
    with FTP(ftpsite) as connection:
        connection.login(ftpuser, ftppswd)
        connection.cwd(ftpdir)
        for filename in os.listdir(localUnusedDir):
            connection.delete(filename)
if __name__ == '__main__':
    # command line: [roothtml [unuseddir [skipfile]]]; any argument left
    # off falls back to the default in the same position
    defaults = ['index.html', 'PossiblyUnused', 'whatsnew.html']
    given = sys.argv[1:4]
    rootPage, unusedDir, skipPage = given + defaults[len(given):]

    usedFiles = findUnusedFiles([rootPage], unusedDir, [skipPage])
    movedNames = os.listdir(unusedDir)

    # run summary: count of what now sits in the unused dir, then the
    # used files in sorted order
    print('-' * 80)
    print('**Summary**\n')
    movedReport = '%d unused files moved to:\n\t%s\n' % (
        len(movedNames), os.path.abspath(unusedDir))
    print(movedReport)
    print('%d used files in this site: ' % len(usedFiles))
    for name in sorted(usedFiles):
        print('\t', name)

    # disabled: remote cleanup is left off because the unused dir needs
    # manual inspection first
    #
    # if input('delete remotely?') in 'yY':
    #     deleteUnusedRemote(unusedDir, input('site?'), input('user?'), input('pswd?'))