prog:python_data_loader_oracle_bfile_text_index
Unterschiede
Hier werden die Unterschiede zwischen zwei Versionen angezeigt.
Beide Seiten der vorigen RevisionVorhergehende Überarbeitung | |||
prog:python_data_loader_oracle_bfile_text_index [2016/04/17 20:02] – gpipperr | prog:python_data_loader_oracle_bfile_text_index [2016/04/24 21:47] (aktuell) – [Code DocumentLoader.py (Python 3.4 !)] gpipperr | ||
---|---|---|---|
Zeile 1: | Zeile 1: | ||
+ | =====Mit Python 3.4 Referenzen alle Dateien in einer Verzeichnisstruktur in der Oracle Datenbank speichern===== | ||
+ | |||
+ | **Aufgabe: | ||
+ | |||
+ | Eine Verzeichnisstruktur auf einen Windows Server soll über Oracle Text in der Oracle Datenbank indiziert werden. | ||
+ | |||
+ | Dazu muss eine Referenz auf jede Datei in der Datenbank gespeichert werden. | ||
+ | |||
+ | Beim Laden soll auch der Datei Typ, die Datei Endung, der MIME Type, der MD5 Hash, die Größe der Datei und die Datumsfelder der Datei bestimmt und in die Ziel Tabelle geladen werden. | ||
+ | |||
+ | Später soll noch ein Vorschaubild und eine Zusammenfassung der Meta Daten der Datei, die Autor etc und die Sprache des Dokuments ermittelt werden. | ||
+ | |||
+ | |||
+ | Nach dem Laden werden die Daten mit Oracle Text indiziert, siehe dazu => [[dba: | ||
+ | |||
+ | |||
+ | Über eine APEX Oberfläche kann dann eine Suche über die Dateien durchgeführt werden siehe dazu => [[prog: | ||
+ | |||
+ | ---- | ||
+ | |||
+ | ==== Vorbereitung ==== | ||
+ | |||
+ | === Mit Unix Magic (file Befehl) den File Type bestimmen ==== | ||
+ | |||
+ | Leider ist es mir unter Windows unter Python nicht gelungen, Magic direkt einzusetzen. | ||
+ | |||
+ | Normalerweise müssen nur die Windows DDL' von Magic zur Verfügung stehen und mit python-magic Package von https:// | ||
+ | |||
+ | Leider hat das aber bei mir nicht funktioniert. | ||
+ | |||
+ | <code python> | ||
+ | .\python -m pip install python-magic | ||
+ | </ | ||
+ | |||
+ | Libs von herunterladen: | ||
+ | |||
+ | |||
+ | * http:// | ||
+ | * file über http:// | ||
+ | * Dependencies wie lib1.dll und regex2.dll über http:// | ||
+ | |||
+ | |||
+ | Test: | ||
+ | <code python> | ||
+ | import magic | ||
+ | magic_file_path= " | ||
+ | magic.Magic(magic_file =magic_file_path) | ||
+ | |||
+ | #Fehler: | ||
+ | ... | ||
+ | D: | ||
+ | D: | ||
+ | ... | ||
+ | #usw.. | ||
+ | </ | ||
+ | |||
+ | Trotz einer weitere Versuche und Internet Recherche ist es mir nicht gelungen das hinzubekommen. | ||
+ | |||
+ | Daher rufe ich nun magic über " | ||
+ | |||
+ | Gerade bei alten Office Formaten kann magic sehr hilfreich sein, es wird zum Beispiel auch der Autor mit ausgelesen. | ||
+ | |||
+ | === Directory Objekt in der Datenbank anlegen und Zugriffs-Rechte vergeben === | ||
+ | |||
+ | Über ein Directory Object wird der Einstieg in die Verzeichnisstruktur auf der Festplatte definiert. | ||
+ | |||
+ | In unseren Test Fall liegen die Daten unter " | ||
+ | |||
+ | Anlegen als SYS und Rechte vergeben: | ||
+ | |||
+ | <code sql> | ||
+ | sqlplus / as sysdba | ||
+ | |||
+ | #Directory anlegen | ||
+ | create directory INFO_ARCHIVE as ' | ||
+ | |||
+ | #Rechte vergeben | ||
+ | grant read,write on directory INFO_ARCHIVE to GPI; | ||
+ | </ | ||
+ | |||
+ | |||
+ | === Dokumenten Tabelle anlegen === | ||
+ | |||
+ | In diese Tabelle werden die Informationen eingelesen: | ||
+ | <code sql> | ||
+ | |||
+ | sqlplus gpi/gpi | ||
+ | |||
+ | --------------------------------------------------------- | ||
+ | -- Document table | ||
+ | --------------------------------------------------------- | ||
+ | |||
+ | CREATE TABLE Documents | ||
+ | ( | ||
+ | ID NUMBER (30) NOT NULL , | ||
+ | Filename | ||
+ | FileTyp | ||
+ | FileDirectory | ||
+ | FilePointer BFILE NOT NULL , | ||
+ | MD5Hash | ||
+ | FileCreateDate | ||
+ | FileLastModify | ||
+ | Language | ||
+ | CreateDate | ||
+ | CreateUser | ||
+ | ChangeDate | ||
+ | ChangeUser | ||
+ | Theme_data_avaiable VARCHAR2 (1 CHAR) DEFAULT ' | ||
+ | ) ; | ||
+ | |||
+ | CREATE UNIQUE INDEX IDX_Documents_ID_PK ON Documents ( ID ASC ); | ||
+ | |||
+ | ALTER TABLE Documents ADD CONSTRAINT Document_PK PRIMARY KEY ( ID ) ; | ||
+ | |||
+ | COMMENT ON TABLE Documents IS ' | ||
+ | COMMENT ON COLUMN Documents.ID IS ' | ||
+ | COMMENT ON COLUMN Documents.Filename IS 'Name of the file on disk' ; | ||
+ | COMMENT ON COLUMN Documents.FileTyp IS 'Typ of the file' ; | ||
+ | COMMENT ON COLUMN Documents.FileDirectory IS ' | ||
+ | COMMENT ON COLUMN Documents.FilePointer IS 'Bfile Pointer to the File' ; | ||
+ | COMMENT ON COLUMN Documents.MD5Hash IS 'Hash of the files to indentify dublicate files' ; | ||
+ | COMMENT ON COLUMN Documents.FileCreateDate IS 'File Create Time from the file' ; | ||
+ | COMMENT ON COLUMN Documents.FileLastModify IS 'Last modificatoin date from the file' ; | ||
+ | COMMENT ON COLUMN Documents.Language IS ' | ||
+ | COMMENT ON COLUMN Documents.CreateDate IS 'Date when the record was created' | ||
+ | COMMENT ON COLUMN Documents.CreateUser IS 'User create the record' | ||
+ | COMMENT ON COLUMN Documents.ChangeDate IS 'Last Change on the record' | ||
+ | COMMENT ON COLUMN Documents.ChangeUser IS 'User change the record' | ||
+ | COMMENT ON COLUMN Documents.Theme_data_avaiable IS 'If Themdata is there => Y, if not N' ; | ||
+ | |||
+ | --------------------------------------------------------- | ||
+ | |||
+ | create sequence documents_seq; | ||
+ | </ | ||
+ | |||
+ | |||
+ | |||
+ | ---- | ||
+ | |||
+ | === Oracle CX für den Datenbank Zugriff für Python 3.4 einrichten === | ||
+ | |||
+ | Siehe dazu => [[python: | ||
+ | |||
+ | |||
+ | ---- | ||
+ | |||
+ | |||
+ | ==== Code DocumentLoader.py (Python 3.4 !) ==== | ||
+ | |||
+ | Aufruf mit " | ||
+ | |||
+ | Beim ersten Aufruf wird das Template für die Konfigurationsdatei " | ||
+ | |||
+ | |||
+ | <code python DocumentLoader.py> | ||
+ | __author__ = ' | ||
+ | |||
+ | import datetime, time | ||
+ | import glob, filecmp, ntpath, shutil | ||
+ | import os, errno, sys, getopt | ||
+ | import hashlib | ||
+ | import subprocess | ||
+ | import io | ||
+ | import cx_Oracle | ||
+ | import locale | ||
+ | import configparser | ||
+ | |||
+ | |||
+ | # get FileType | ||
+ | def getFileType(file, | ||
+ | # magicFilePath = " | ||
+ | result = " | ||
+ | # Command + arguments | ||
+ | args = magicFilePath + " " + command + ' "' | ||
+ | # execute the program and pipe the output to subprocess.PIPE | ||
+ | prog = subprocess.Popen(args, | ||
+ | # wait until finished | ||
+ | stat = prog.wait() | ||
+ | # connect pipes together | ||
+ | (output, err) = prog.communicate() | ||
+ | result = str(output) | ||
+ | result = str.replace(result, | ||
+ | result = str.replace(result, | ||
+ | result = str.replace(result, | ||
+ | escFileName = str.replace(str(file), | ||
+ | result = str.replace(result, | ||
+ | return result | ||
+ | |||
+ | |||
+ | # Get extension | ||
+ | # Get extension | ||
+ | def getFile_ext(filename): | ||
+ | fparts = filename.split(' | ||
+ | c = len(fparts) | ||
+ | # try to get .tar.gz | ||
+ | v_return = fparts[-1] | ||
+ | if c > 3: | ||
+ | if len(fparts[-2]) < 5: | ||
+ | chars = set(' | ||
+ | if any((c in chars) for c in fparts[-2]): | ||
+ | v_return = fparts[-1] | ||
+ | else: | ||
+ | v_return = fparts[-2] + " | ||
+ | |||
+ | if v_return.find(' | ||
+ | v_return = " | ||
+ | |||
+ | return v_return.lower() | ||
+ | |||
+ | |||
+ | # get the MD5 Hash the file | ||
+ | def getMD5(filename): | ||
+ | blocksize = 65536 | ||
+ | if os.path.exists(filename) == False: | ||
+ | return ' | ||
+ | md5 = hashlib.md5() | ||
+ | with open(filename, | ||
+ | for block in iter(lambda: | ||
+ | md5.update(block) | ||
+ | return md5.hexdigest() | ||
+ | |||
+ | |||
+ | # Remember the global Size of all copied files | ||
+ | def setStatisticTotalSize(size): | ||
+ | global totalFileSize | ||
+ | totalFileSize += size | ||
+ | |||
+ | |||
+ | # global for the total filesize | ||
+ | totalFileSize = 0 | ||
+ | |||
+ | |||
+ | # print the file info | ||
+ | # later insert the record to the database | ||
+ | # | ||
+ | def insertFileInfo(fileInfo, | ||
+ | # strip the oracle path | ||
+ | relative_doc_path = str.replace(fileInfo[' | ||
+ | try: | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | except: | ||
+ | print(" | ||
+ | |||
+ | # insert into the database | ||
+ | |||
+ | # Cursor auf die DB oeffenen | ||
+ | cursor = connection.cursor() | ||
+ | |||
+ | # filename, | ||
+ | # .encode(' | ||
+ | # .encode(' | ||
+ | |||
+ | doc_rows = [(str(fileInfo[' | ||
+ | , fileInfo[' | ||
+ | , fileInfo[' | ||
+ | , fileInfo[' | ||
+ | , fileInfo[' | ||
+ | , fileInfo[' | ||
+ | , relative_doc_path + os.sep + str(fileInfo[' | ||
+ | , fileInfo[' | ||
+ | , fileInfo[' | ||
+ | ) | ||
+ | ] | ||
+ | |||
+ | # how many rows to insert | ||
+ | cursor.bindarraysize = len(doc_rows) | ||
+ | |||
+ | # datatype - lenght of the inserted data | ||
+ | # cursor.setinputsizes(int, | ||
+ | try: | ||
+ | # insert the whole record | ||
+ | cursor.executemany( | ||
+ | " | ||
+ | doc_rows) | ||
+ | except cx_Oracle.DatabaseError as e: | ||
+ | connection.rollback() | ||
+ | print(" | ||
+ | except Exception as e: | ||
+ | print(" | ||
+ | connection.rollback() | ||
+ | # raise | ||
+ | connection.commit() | ||
+ | |||
+ | |||
+ | # Main Script part | ||
+ | def main(argv): | ||
+ | global totalFileSize | ||
+ | |||
+ | |||
+ | # Parameter 1 - Import Directory | ||
+ | # Parameter 2 - Subfolder Level | ||
+ | |||
+ | |||
+ | # Remember the start time of the program | ||
+ | start_time = time.clock() | ||
+ | |||
+ | usage_string = " DocumentLoader.py | ||
+ | |||
+ | path_name = ' | ||
+ | dest_name = ' | ||
+ | recursiveLevel = 0 | ||
+ | config_path = ' | ||
+ | |||
+ | # -- Parameter from the config file | ||
+ | |||
+ | oracle_port = ' | ||
+ | oracle_host = ' | ||
+ | oracle_service = ' | ||
+ | oracle_user = ' | ||
+ | oracle_pwd = ' | ||
+ | magicFilePath = ' | ||
+ | ingoreFileExt = [] | ||
+ | |||
+ | # Path of the oracle Info Archive | ||
+ | ora_dir_path = ' | ||
+ | |||
+ | try: | ||
+ | opts, args = getopt.getopt(argv, | ||
+ | except getopt.GetoptError: | ||
+ | print(" | ||
+ | sys.exit(2) | ||
+ | # read the parameter | ||
+ | for opt, arg in opts: | ||
+ | if opt == ' | ||
+ | print(" | ||
+ | sys.exit() | ||
+ | elif opt in (" | ||
+ | path_name = arg | ||
+ | elif opt in (" | ||
+ | ora_dir_path = arg | ||
+ | elif opt in (" | ||
+ | config_path = arg | ||
+ | elif opt in (" | ||
+ | recursiveLevel = int(arg) | ||
+ | |||
+ | # read the config file | ||
+ | config = configparser.ConfigParser() | ||
+ | # check if the file exits | ||
+ | if os.path.exists(config_path) == False: | ||
+ | # use normal configparser to write the template | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | config[' | ||
+ | config[' | ||
+ | ' | ||
+ | with open(config_path, | ||
+ | config.write(configfile) | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | sys.exit(2) | ||
+ | else: | ||
+ | print(" | ||
+ | config.read(config_path) | ||
+ | # Parameter of the application | ||
+ | general_configuration = config[' | ||
+ | magicFilePath = general_configuration[' | ||
+ | ingoreFileExtString = general_configuration[' | ||
+ | ingoreFileExt = str.split(str.replace(ingoreFileExtString, | ||
+ | # Oracle DB Connect | ||
+ | oracle_db_configuration = config[' | ||
+ | oracle_port = oracle_db_configuration[' | ||
+ | oracle_host = oracle_db_configuration[' | ||
+ | oracle_service = oracle_db_configuration[' | ||
+ | oracle_user = oracle_db_configuration[' | ||
+ | oracle_pwd = oracle_db_configuration[' | ||
+ | |||
+ | |||
+ | # check if Directory exists and if the * is necessary | ||
+ | # BUG ! if more then 1 then the * not match the documents on root level?? | ||
+ | # FIX IT! | ||
+ | # Source | ||
+ | if os.path.isdir(path_name): | ||
+ | if path_name.endswith(os.path.sep): | ||
+ | path_name += (" | ||
+ | path_name += " | ||
+ | else: | ||
+ | path_name += os.path.sep | ||
+ | path_name += (" | ||
+ | path_name += " | ||
+ | else: | ||
+ | print(" | ||
+ | print(" | ||
+ | sys.exit(2) | ||
+ | |||
+ | # connect to the database | ||
+ | print(" | ||
+ | print(" | ||
+ | # get the connection to the database | ||
+ | print(" | ||
+ | oracle_host, | ||
+ | connection = cx_Oracle.connect( | ||
+ | oracle_user + '/' | ||
+ | # Version der DB ausgeben | ||
+ | print(" | ||
+ | print(" | ||
+ | |||
+ | print(" | ||
+ | locale.getdefaultlocale()[1])) | ||
+ | |||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | |||
+ | fileCount = 0 | ||
+ | fileExistsCount = 0 | ||
+ | dirCount = 0 | ||
+ | dirPathList = [] | ||
+ | totalFileSize = 0 | ||
+ | # Get the list of all Files | ||
+ | fileList = glob.glob(path_name) | ||
+ | |||
+ | # remove Thumbs.db if exist from the list | ||
+ | thumbsDBFile = " | ||
+ | for file in fileList: | ||
+ | if file.endswith(thumbsDBFile): | ||
+ | fileList.remove(file) | ||
+ | |||
+ | # Loop one read files in Import Directory | ||
+ | for file in fileList: | ||
+ | fileCount += 1 | ||
+ | |||
+ | # do the work | ||
+ | try: | ||
+ | # check if directoy | ||
+ | if ntpath.isdir(file): | ||
+ | print(" | ||
+ | dirCount += 1 | ||
+ | else: | ||
+ | # get only the filename without the path | ||
+ | filename = ntpath.basename(file) | ||
+ | # get directory | ||
+ | fileDirectoryName = ntpath.dirname(file) | ||
+ | # get Create date | ||
+ | |||
+ | fileAccessDate = datetime.datetime.fromtimestamp(ntpath.getatime(file)) | ||
+ | fileModDate = datetime.datetime.fromtimestamp(ntpath.getmtime(file)) | ||
+ | fileCreateDate = datetime.datetime.fromtimestamp(ntpath.getctime(file)) | ||
+ | # get md5 hash | ||
+ | md5checkSum = getMD5(file) | ||
+ | # get File Type over file from external, Python Lib magic not working, error with magic file! | ||
+ | # now I implement this stupid solution | ||
+ | fileType = getFileType(file, | ||
+ | # Call file with -i to get the mime Type | ||
+ | fileMimeType = getFileType(file, | ||
+ | # get Extenstion | ||
+ | fileExt = getFile_ext(filename) | ||
+ | # getFileSize | ||
+ | fileSize = os.path.getsize(file) | ||
+ | # Remember for statistic | ||
+ | setStatisticTotalSize(fileSize) | ||
+ | |||
+ | # not add url files to the index | ||
+ | # endswith(" | ||
+ | |||
+ | if fileExt in ingoreFileExt: | ||
+ | # encode the output with UTF-8 to avoid errors with stange things in filenames | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | else: | ||
+ | # record | ||
+ | fileInfo = dict(filename=str(filename), | ||
+ | fileMDate=fileModDate, | ||
+ | fileBType=fileType, | ||
+ | fileMtype=fileMimeType, | ||
+ | |||
+ | ## | ||
+ | # encode the output with UTF-8 to avoid errors with stange things in filenames | ||
+ | ## | ||
+ | # write to DB | ||
+ | insertFileInfo(fileInfo, | ||
+ | |||
+ | except OSError as exception: | ||
+ | if exception.errno != errno.EEXIST: | ||
+ | print(" | ||
+ | |||
+ | |||
+ | # print statistics | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | print(" | ||
+ | |||
+ | # Close the DB Connection | ||
+ | connection.close() | ||
+ | |||
+ | |||
+ | if __name__ == " | ||
+ | main(sys.argv[1: | ||
+ | |||
+ | |||
+ | </ | ||
+ | |||
+ | Config file: | ||
+ | <code bash dataLoader.conf> | ||
+ | [DEFAULT] | ||
+ | # Path and name of the magic file program | ||
+ | MagicFile=d: | ||
+ | |||
+ | # Do Not read files with this extension | ||
+ | # use , to separate the file extensions | ||
+ | ignoreFileExt=.cmd ,.iso , | ||
+ | |||
+ | [ORACLE_DB_CONNECT] | ||
+ | Port = 1521 | ||
+ | Host = 10.10.10.1 | ||
+ | Service = GPI | ||
+ | DB_User= GPI | ||
+ | DB_Password = xxxxxx | ||
+ | |||
+ | </ | ||
+ | |||
+ | |||
prog/python_data_loader_oracle_bfile_text_index.txt · Zuletzt geändert: 2016/04/24 21:47 von gpipperr