# Source: 003《Python数据分析、挖掘与可视化(第2版)》/例5-5.py
from sys import argv
from os import listdir
from os.path import join, isfile, isdir
from docx import Document
from openpyxl import load_workbook
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE

def checkdocx(dstStr, fn):
    """Report whether a .docx file contains the given text.

    The paragraphs in ``document.paragraphs`` are checked first, then the
    text of every cell of every table in ``document.tables``.

    Args:
        dstStr: Text to look for (substring match via ``in``).
        fn: Path of the .docx file to open.

    Returns:
        True if any paragraph or table cell contains ``dstStr``,
        otherwise False.
    """
    doc = Document(fn)
    # Paragraph text first.
    if any(dstStr in para.text for para in doc.paragraphs):
        return True
    # Then every cell of every table.
    return any(
        dstStr in cell.text
        for tbl in doc.tables
        for tbl_row in tbl.rows
        for cell in tbl_row.cells
    )

def checkxlsx(dstStr, fn):
    """Report whether any text cell of an .xlsx workbook contains dstStr.

    Every cell of every worksheet is visited. Only cells whose value is a
    string are tested; empty cells and numeric/date cells are skipped.

    Args:
        dstStr: Text to look for (substring match via ``in``).
        fn: Path of the .xlsx file to open.

    Returns:
        True if a string-valued cell contains ``dstStr``, otherwise False.
    """
    wb = load_workbook(fn)
    for ws in wb.worksheets:
        for row in ws.rows:
            for cell in row:
                value = cell.value
                # Test the type explicitly instead of wrapping the ``in``
                # test in a bare ``except``: non-string values cannot
                # contain the text, and real errors (or Ctrl+C) are no
                # longer silently swallowed.
                if isinstance(value, str) and dstStr in value:
                    return True
    return False

def checkpptx(dstStr, fn):
    """Report whether a .pptx presentation contains the given text.

    On every slide, table shapes are searched cell by cell, and text boxes
    and placeholders are searched through their text. Other shape types
    are not inspected.

    Args:
        dstStr: Text to look for (substring match via ``in``).
        fn: Path of the .pptx file to open.

    Returns:
        True if a table cell, text box or placeholder contains ``dstStr``,
        otherwise False.
    """
    presentation = Presentation(fn)
    text_kinds = (MSO_SHAPE_TYPE.TEXT_BOX, MSO_SHAPE_TYPE.PLACEHOLDER)
    for slide in presentation.slides:
        for shape in slide.shapes:
            kind = shape.shape_type
            if kind == MSO_SHAPE_TYPE.TABLE:
                # Text inside table cells.
                for row in shape.table.rows:
                    for cell in row.cells:
                        if dstStr in cell.text_frame.text:
                            return True
            elif kind in text_kinds:
                # Some placeholders (e.g. picture placeholders) carry no
                # text frame; skip them explicitly rather than hiding the
                # failure behind a bare ``except``.
                if shape.has_text_frame and dstStr in shape.text:
                    return True
    return False

def main(dstStr, flag):
    """Print the path of each Office document that contains dstStr.

    Starting at the current folder ('.'), every .docx, .xlsx and .pptx
    file is checked with the matching ``check*`` function and its path is
    printed when the text is found. Folders are visited breadth-first.

    Args:
        dstStr: Text to look for.
        flag: When true, subfolders are searched as well; otherwise only
            the current folder is scanned.
    """
    # Function-scope import: a deque pops from the left in O(1), whereas
    # ``list.pop(0)`` shifts every remaining element.
    from collections import deque

    # A single dot denotes the current folder.
    pending = deque(['.'])
    while pending:
        # Take the oldest folder that has not been visited yet.
        currentDir = pending.popleft()
        for fn in listdir(currentDir):
            path = join(currentDir, fn)
            if isfile(path):
                # Office writes '~$<name>' owner/lock files next to open
                # documents. They keep the document's extension but are
                # not valid packages, so opening one would abort the scan.
                if fn.startswith('~$'):
                    continue
                if path.endswith('.docx'):
                    found = checkdocx(dstStr, path)
                elif path.endswith('.xlsx'):
                    found = checkxlsx(dstStr, path)
                elif path.endswith('.pptx'):
                    found = checkpptx(dstStr, path)
                else:
                    found = False
                if found:
                    print(path)
            # Queue subfolders for the breadth-first walk.
            elif flag and isdir(path):
                pending.append(path)


# argv[0] is the script name.
# Usage: <script> [/s] <text>
#   /s      also check the files in all subfolders
#   <text>  the text to search for
# Guarded so that importing this module does not start a search.
if __name__ == '__main__':
    args = argv[1:]
    # A leading '/s' switches on the subfolder search.
    flag = bool(args) and args[0] == '/s'
    if flag:
        args = args[1:]
    # Show a usage message instead of failing with IndexError when the
    # search text is missing.
    if not args:
        raise SystemExit('Usage: {} [/s] <text>'.format(argv[0]))
    dstStr = args[0]

    main(dstStr, flag)