-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdeltasTXT_hashing_LOWMEM.py
134 lines (102 loc) · 3.77 KB
/
deltasTXT_hashing_LOWMEM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/python
import sys,gzip,zlib
import hashlib
import datetime
# Separator between the fields of one data row in the old/new dump files.
dataFieldSeparator='\t'
# Separator between the column names on the first line of the header file.
headerSeparator=','
# Positional command-line arguments (all four are required; a missing one
# raises IndexError right here):
#   1: table name -- stored, but not read anywhere else in the visible script
#   2: old dump, opened with gzip
#   3: new dump, opened with gzip
#   4: plain-text file whose first line holds the column names
tableName =sys.argv[1]
oldFileName =sys.argv[2]
newFileName =sys.argv[3]
headerFileName =sys.argv[4]
# id -> pending action: 'D' (id only in the old dump), 'U' (row content
# differs) or 'I' (id only in the new dump). Unchanged ids are removed.
eventAction={}
# id -> MD5 hex digest of the complete old line (trailing newline included).
oldContentHash={}
# id -> zlib-compressed text. Holds the whole old row for 'D', the whole new
# row for 'I', and only the old values of the changed fields for 'U'.
oldContentData={}
# id -> zlib-compressed new values of the changed fields ('U' rows only).
updContentData={}
# id -> names of the changed columns joined with headerSeparator ('U' only).
updContentHeader={}
################################################################################
### READING PART ###############################################################
################################################################################
#print datetime.datetime.now()
# First pass: index the old dump. Every data row gets an MD5 of the raw line,
# a zlib-compressed copy of the row (newline removed) and a provisional 'D'
# action, which stays in place unless the same id shows up in the new dump.
fileIn = gzip.open(oldFileName, 'r')
# try/finally instead of `with`: GzipFile only became a context manager in
# Python 2.7, and the handle must be released even if a row fails.
try:
    for thisLine in fileIn:
        # Data rows are recognised by a leading digit; everything else
        # (column titles, comments, blank lines) is skipped.
        if not thisLine[0].isdigit():
            continue
        # Only the id is needed, so stop splitting at the first separator.
        thisId = thisLine.split(dataFieldSeparator, 1)[0]
        oldContentHash[thisId] = hashlib.md5(thisLine).hexdigest()
        oldContentData[thisId] = zlib.compress(thisLine.rstrip('\n'))
        eventAction[thisId] = 'D'
finally:
    fileIn.close()
#print eventAction
# Header, write once execute everywhere :-D
# Load the column names. Only the first line of the header file is used; its
# trailing newline is removed and the rest is split on headerSeparator.
# The `with` block closes the file even if the read fails.
with open(headerFileName, 'r') as headerFile:
    headerString = headerFile.readline().rstrip('\n')
headerList = headerString.split(headerSeparator)
headerLen = len(headerList)
#print "new"
#print datetime.datetime.now()
# Second pass: classify every data row of the new dump against the old index.
#   id unknown            -> 'I', the whole new row is kept (compressed)
#   id known, same hash   -> unchanged, dropped from eventAction/oldContentData
#   id known, other hash  -> 'U', only the fields whose value differs are kept
# Ids left untouched keep the 'D' action assigned while reading the old dump.
#
# NOTE: ids are expected to be unique within the new dump. Once an unchanged
# id has been dropped from eventAction, a second row with that id is no longer
# recognised and is reported as 'I'.
fileIn = gzip.open(newFileName, 'r')
# try/finally instead of `with`: GzipFile only became a context manager in
# Python 2.7, and the handle must be released even if a row fails.
try:
    for thisLine in fileIn:
        # Data rows are recognised by a leading digit; skip everything else.
        if not thisLine[0].isdigit():
            continue
        thisId = thisLine.split(dataFieldSeparator, 1)[0]

        if thisId not in eventAction:
            # Insert: the full new row is parked in oldContentData, which is
            # where the reporting loop looks for 'I' rows.
            eventAction[thisId] = 'I'
            oldContentData[thisId] = zlib.compress(thisLine.rstrip('\n'))
            continue

        if oldContentHash[thisId] == hashlib.md5(thisLine).hexdigest():
            # Identical line: nothing to report, free the stored row.
            del eventAction[thisId]
            del oldContentData[thisId]
            continue

        # Update: keep only the fields that changed, position by position.
        eventAction[thisId] = 'U'
        oldData = zlib.decompress(oldContentData[thisId]).split(dataFieldSeparator)
        thisData = thisLine.rstrip('\n').split(dataFieldSeparator)
        changedOld = []
        changedNew = []
        changedNames = []
        for fieldIdx, oldValue in enumerate(oldData):
            # Raises IndexError when the new row has fewer fields than the
            # old one, or when a changed field lies beyond the header list.
            newValue = thisData[fieldIdx]
            if oldValue != newValue:
                changedOld.append(oldValue)
                changedNew.append(newValue)
                changedNames.append(headerList[fieldIdx])
        # join() rather than "append separator, then rstrip": rstrip would
        # also swallow empty values at the end of the list, leaving the value
        # lists shorter than the column list when they are split again.
        oldContentData[thisId] = zlib.compress(dataFieldSeparator.join(changedOld))
        updContentData[thisId] = zlib.compress(dataFieldSeparator.join(changedNew))
        updContentHeader[thisId] = headerSeparator.join(changedNames)
finally:
    fileIn.close()
# The hashes are only needed while comparing; release them before reporting.
del oldContentHash
#print eventAction
def _renderPrintItems(items):
    """Lay out *items* like Python 2 ``print item,`` statements ended by ``print``.

    A single blank is written before an item unless the previous item ended
    in whitespace other than a blank (tab, newline, ...). Nothing is written
    before the first item. The line is terminated with a newline.

    items -- sequence of strings, in output order.
    Returns the complete line as one string.
    """
    pieces = []
    needSpace = False
    for item in items:
        if needSpace:
            pieces.append(' ')
        pieces.append(item)
        # Empty items and items ending in a blank or a non-space character
        # request a separator before the next item.
        needSpace = not item or not item[-1].isspace() or item[-1] == ' '
    pieces.append('\n')
    return ''.join(pieces)


# Report one line per remaining event:
#   D <id> <TAB>OLD  col=value<TAB>...      full old row
#   I <id> NEW  col=value<TAB>...           full new row
#   U <id> col OLD VAL=a NEW VAL=b<TAB>...  changed columns only
# Stored rows are released as soon as they have been written.
# Leading items that precede the column list for full-row actions.
rowPrefix = {'D': ('\t', 'OLD '), 'I': ('NEW ',)}
for thisId, thisAction in eventAction.items():
    items = [thisAction, thisId]
    if thisAction in rowPrefix:
        items.extend(rowPrefix[thisAction])
        newData = zlib.decompress(oldContentData[thisId]).split(dataFieldSeparator)
        # Raises IndexError if the row has fewer fields than the header.
        for fieldIdx, columnName in enumerate(headerList):
            items.append(columnName + "=" + newData[fieldIdx] + '\t')
    elif thisAction == 'U':
        oldData = zlib.decompress(oldContentData[thisId]).split(dataFieldSeparator)
        updData = zlib.decompress(updContentData[thisId]).split(dataFieldSeparator)
        thisHeader = updContentHeader[thisId].split(headerSeparator)
        # Defensive: if trailing empty values were stripped from the stored
        # strings, the lists come back shorter than the column list. The
        # missing entries can only have been empty, so restore them as ''.
        oldData.extend([''] * (len(thisHeader) - len(oldData)))
        updData.extend([''] * (len(thisHeader) - len(updData)))
        for fieldIdx, columnName in enumerate(thisHeader):
            items.append(columnName + " OLD VAL=" + oldData[fieldIdx]
                         + " NEW VAL=" + updData[fieldIdx] + "\t")
        # Only update rows have an entry here.
        del updContentData[thisId]
    # Every id in eventAction has an entry in oldContentData.
    del oldContentData[thisId]
    sys.stdout.write(_renderPrintItems(items))
#print
#print datetime.datetime.now()