#Intelligent directory copy

#conform.py        (version 0.1, partly tested only)
#smart directory copying in Python
#This solves the following problem:
#there are two directories Master and Slave, presumably on different drives.
#Slave is meant to be a copy of Master and remain so. They both contain huge files (for example bitmaps). 
#From time to time Master is reorganised (files are moved) and some new files are added,
# files may even be deleted.
#Filenames (without their directory part) are assumed to be unique in general:
# two files should not bear the same name in different subdirectories, unless they are part of a list of exceptions.
# Exceptions are due to a common phenomenon: readme.txt files and the like might have the same name in lots of subdirectories.
# This set of exceptions can be furnished at the start of the script call.
#This script maintains the conformity of Master and Slave.
#How does it work?
# A dictionary of files from the Master is created. The keys are file names. The values are relative directories.
# Errors can be found: duplicate files. The process stops when all duplicate files are found.
# A dictionary of files from the Slave is created. Same structure.
# directories in master that are not in slave are created in slave
# files are moved within slave to make the slave look like the structure in master.
# new files from master are copied into slave
# old files are removed from slave.
# old directories are removed from the slave.


import os.path
import os
import stat
import sys

#debug flag; it is not referenced anywhere in the visible code
dbg = 1
#modification time of a file, as an int
def file_date(filepath):
	"""Return the ST_MTIME slot of os.stat(filepath) (last-modification time)."""
	return os.stat(filepath)[stat.ST_MTIME]

def copy_file(from_file, to):
	"""Copy the bytes of from_file into to, creating or truncating to.

	from_file -- path of the file to read (opened in binary mode)
	to        -- path of the file to write (opened in binary mode)

	Only the contents are copied; the modification time of the source
	is not transferred to the copy.
	Errors from open/read/write propagate to the caller, but both file
	handles are always closed first.
	"""
	ifh = open(from_file,"rb")
	try:
		ofh = open(to,"wb")
		try:
			#copy in 4 KB blocks so huge files never have to fit in memory
			while 1:
				block = ifh.read(4096)
				if not block:
					break
				ofh.write(block)
		finally:
			ofh.close()
	finally:
		ifh.close()
	
class path_info:
	"""Index of the files found under one root directory, keyed by basename.

	Attributes:
	name              -- label given by the caller (e.g. "master" or "slave")
	root              -- the root path as given
	rootlen           -- len(root), kept so walkers can strip the root prefix
	files_dir_dict    -- basename -> one relative directory containing it
	duplicates        -- basename -> list of all relative directories, only
	                     for basenames that were added more than once
	has_duplicates    -- 1 once any basename has been added twice, else 0
	exceptional_files -- basenames which can appear in lots of subdirs
	"""
	def __init__(self, name, path, exceptional_files):
		self.name = name
		self.root = path
		self.rootlen = len(path)
		self.files_dir_dict = {}#contains just one relative dir for a basename -- do we really need this?? Not absolutely!!
		self.duplicates = {}#contains all relative dirs for a basename, but only if there are duplicates
		self.has_duplicates = 0
		self.exceptional_files = exceptional_files #a list of file basenames which can appear in lots of subdirs 
		
	def add_file(self, basename, rel_dir):
		"""Record that basename was found in rel_dir.

		The first sighting goes into files_dir_dict. Any further sighting
		marks the instance as having duplicates and collects every
		directory seen so far in duplicates[basename].
		"""
		dd = self.duplicates
		df = self.files_dir_dict
		if basename not in df:
			df[basename] = rel_dir
			return
		self.has_duplicates = 1
		if basename in dd:
			dd[basename].append(rel_dir)
		else:
			dd[basename]=[df[basename],rel_dir]
			#as in the original: files_dir_dict is only overwritten on the
			#first duplicate, later duplicates leave it untouched
			df[basename]=rel_dir
	
#sample list of basenames allowed to repeat across subdirectories; the visible
#code does not read this variable (conform() is called with its own literal list)
exceptional_files = ["readme.txt"]#just a sample list

#used as a "visit" function for a directory walk
def dir_add_dict(arg, dirname, names):
	"""Register every regular file of one directory in a path_info.

	arg     -- a path_info instance (its rootlen, exceptional_files and
	           add_file are used)
	dirname -- the directory being visited; expected to start with the
	           root path whose length is arg.rootlen
	names   -- the entry names found in dirname

	Entries that are not regular files, and basenames listed in
	arg.exceptional_files, are skipped.
	"""
	#strip the root prefix and then any leading separator, so that a root
	#given with a trailing separator does not eat the first character
	separators = os.sep + (os.altsep or "")
	relative_dir = dirname[arg.rootlen:].lstrip(separators)
	for basename in names:
		if basename in arg.exceptional_files:
			continue
		if os.path.isfile(os.path.join(dirname,basename)):
			arg.add_file(basename, relative_dir)
				

def build_dic_master(rootpath, exceptions):
	"""Index all files under the master root.

	rootpath   -- root directory of the master tree
	exceptions -- basenames that may repeat across subdirectories; they
	              are left out of the index

	Returns the filled path_info, or None after printing a report when
	some basename occurs in more than one directory.
	"""
	master_info = path_info("master",rootpath,exceptions)
	#os.walk replaces os.path.walk, which is deprecated and absent in Python 3
	for dirpath, dirnames, filenames in os.walk(rootpath):
		dir_add_dict(master_info, dirpath, dirnames + filenames)
	if not master_info.duplicates:
		return master_info
	print("Duplicates in master")
	for basename in master_info.duplicates:
		print("file: %s" % (basename,))
		print("found in these directories:")
		for rel_dir in master_info.duplicates[basename]:
			sys.stdout.write('"%s" '%rel_dir)
		print("")
	return None
def build_dic_slave(rootpath,exceptions):
	"""Index all files under the slave root.

	rootpath   -- root directory of the slave tree
	exceptions -- basenames that may repeat across subdirectories; they
	              are left out of the index

	Returns the filled path_info, or None after printing a report when
	some basename occurs in more than one directory.
	"""
	slave_info = path_info("slave",rootpath,exceptions)
	#os.walk replaces os.path.walk, which is deprecated and absent in Python 3
	for dirpath, dirnames, filenames in os.walk(rootpath):
		dir_add_dict(slave_info, dirpath, dirnames + filenames)
	if not slave_info.duplicates:
		return slave_info
	print("Duplicates in slave")
	for basename in slave_info.duplicates:
		print("%s" % (basename,))
		for rel_dir in slave_info.duplicates[basename]:
			sys.stdout.write("%s  "%rel_dir)
		print("")
	return None

def create_subdirs(arg, curr_indir, entries):
	"""Mirror the subdirectories of one input directory into the output tree.

	arg        -- tuple (out_rootpath, in_rootpath_len): the output root and
	              the length of the input root path
	curr_indir -- the input directory being visited; expected to start with
	              the input root
	entries    -- the entry names found in curr_indir

	For every entry that is a directory, the directory with the same
	relative path is created under out_rootpath unless that path already
	exists. Non-directory entries are ignored.
	"""
	(out_rootpath,in_rootpath_len) = arg
	#strip the root prefix and then any leading separator, so that a root
	#given with a trailing separator does not eat the first character
	separators = os.sep + (os.altsep or "")
	relative_dir = curr_indir[in_rootpath_len:].lstrip(separators)
	for entry in entries:
		#os.path.join instead of a hard-coded backslash: works on every platform
		if not os.path.isdir(os.path.join(curr_indir,entry)):
			continue
		out_dir = os.path.join(out_rootpath,relative_dir,entry)
		if not os.path.exists(out_dir):
			#makedirs also creates a missing output root
			os.makedirs(out_dir)
				
def copy_directory_structure(in_rootpath, out_rootpath):
	"""Recreate the directory tree of in_rootpath under out_rootpath.

	Only directories are created; files are not copied. Each visited
	directory is handed to create_subdirs.
	"""
	arg = (out_rootpath, len(in_rootpath))
	#top-down os.walk (replacing the deprecated os.path.walk) visits a parent
	#before its children, so parents are created first
	for curr_indir, dirnames, filenames in os.walk(in_rootpath):
		create_subdirs(arg, curr_indir, dirnames + filenames)

def move_files(master_info,slave_info):
	"""Make the files of the slave tree match the master index.

	master_info, slave_info -- path_info-like objects; their root and
	                           files_dir_dict attributes are used

	Step 1: every indexed slave file is moved to the relative directory
	the master has for that basename, or deleted if the master has no
	such basename.
	Step 2: every indexed master file is copied to the slave when it is
	missing there or when the slave copy has an older modification time.

	The target directories must already exist in the slave.
	"""
	dict_master = master_info.files_dir_dict
	dict_slave = slave_info.files_dir_dict
	#first move/delete files in slave
	for basename in dict_slave:
		old_file = os.path.join(slave_info.root,dict_slave[basename],basename)
		if basename in dict_master:
			new_file = os.path.join(slave_info.root,dict_master[basename],basename)
			#nothing to do when the file already sits in the right directory
			if new_file != old_file:
				os.rename(old_file,new_file)
		else:
			os.remove(old_file)#rather brutal, are you sure? you could create a batch of files to remove, or
			#you could put them aside....
	#now copy new or newer files from master
	for basename in dict_master:
		rel_dir = dict_master[basename]
		full_path_master = os.path.join(master_info.root, rel_dir, basename)
		full_path_slave = os.path.join(slave_info.root,rel_dir, basename)
		if not os.path.exists(full_path_slave):
			copy_file(full_path_master, full_path_slave)
		elif file_date(full_path_slave) < file_date(full_path_master):
			copy_file(full_path_master, full_path_slave)
		
			
def remove_excess_slave_dirs1(arg, curr_slavedir, entries):
	"""Remove the subdirectories of one slave directory that the master lacks.

	arg           -- tuple (master_rootpath, slave_rootpath_len): the master
	                 root and the length of the SLAVE root path, which is the
	                 prefix stripped from curr_slavedir
	curr_slavedir -- the slave directory being visited
	entries       -- the entry names found in curr_slavedir

	A directory that cannot be removed (for instance because it still
	holds files) is reported on stderr and left in place.
	"""
	(master_rootpath, slave_rootpath_len) = arg
	separators = os.sep + (os.altsep or "")
	relative_dir = curr_slavedir[slave_rootpath_len:].lstrip(separators)
	for entry in entries:
		slave_dir = os.path.join(curr_slavedir,entry)
		#test the full path: a bare entry name would be resolved against the cwd
		if not os.path.isdir(slave_dir):
			continue
		master_dir = os.path.join(master_rootpath,relative_dir,entry)
		if os.path.exists(master_dir):
			continue
		try:
			os.rmdir(slave_dir)
		except OSError:
			sys.stderr.write('could not remove directory "%s"\n'%slave_dir)
		
def remove_excess_slave_dirs(master_rootpath, slave_rootpath):
	"""Remove every directory under slave_rootpath that has no counterpart
	under master_rootpath. The slave root itself is never removed."""
	arg = (master_rootpath, len(slave_rootpath))
	#bottom-up, so nested excess directories are emptied before their parent
	for curr_slavedir, dirnames, filenames in os.walk(slave_rootpath, topdown=False):
		remove_excess_slave_dirs1(arg, curr_slavedir, dirnames + filenames)
#top level call	
def conform(master_rootpath, slave_rootpath, exceptions):
	"""Bring the slave tree into conformity with the master tree.

	master_rootpath -- root directory of the reference tree
	slave_rootpath  -- root directory of the tree to update
	exceptions      -- basenames that may repeat across subdirectories

	Both trees are indexed first. If either index could not be built
	(the builders return None after reporting duplicates), nothing is
	changed. Otherwise the master directory structure is mirrored into
	the slave, files are moved/deleted/copied, and slave directories
	that the master lacks are removed.
	"""
	#validate both trees before touching the slave
	master_info = build_dic_master(master_rootpath, exceptions)
	if master_info is None:
		return
	slave_info = build_dic_slave(slave_rootpath, exceptions)
	if slave_info is None:
		return
	#target directories must exist before files are moved or copied into them
	copy_directory_structure(master_rootpath, slave_rootpath)
	move_files(master_info, slave_info)#the original author noted that a bug shows up at this step
	remove_excess_slave_dirs(master_rootpath, slave_rootpath)

#Script entry point: this call runs as soon as the module is executed.
#Modify the two root paths (master first, slave second) and the list of
#exceptional basenames to suit your own directories.
conform(r"c:\hdef\test\master",r"c:\hdef\test\slave",["readme.txt"])

print "finished processing"