#!/usr/bin/python
#-*-coding:UTF-8 -*-
import sys
import cElementTree
# Field-index offset switch. Per the original author's note, rows coming from
# LZO-compressed Hadoop input are shifted by one column, so every field index
# below is offset by `lzo`. Set debug = True to run against unshifted input.
debug = False
if debug:
    lzo = 0
else:
    lzo = 1


def map_line(line, offset):
    """Convert one tab-separated input row into a single map output record.

    Arguments:
        line   -- raw input row; one trailing newline, if present, is ignored.
        offset -- number of extra leading columns to skip (0 or 1 here).

    Returns the record "date,version,fileext,player,associatekey<TAB>count"
    without a trailing newline, or None when the row does not contain exactly
    11 + offset fields.

    Raises KeyError when the XML root has no UUID attribute, ValueError when
    the UserDoubleClick element or one of its required attributes is missing,
    and the XML parser's own error when the payload is not well-formed.
    """
    # Strip only the newline: slicing off the last character unconditionally
    # would damage a final row that is not newline-terminated.
    flags = line.rstrip('\n').split('\t')
    if len(flags) != 11 + offset:
        return None

    stat_date = flags[0 + offset]
    # Reformat YYYYMMDD as YYYY-MM-DD (e.g. 2011-11-29).
    stat_date_bar = stat_date[:4] + '-' + stat_date[4:6] + '-' + stat_date[6:8]
    version = flags[4 + offset]
    xmlstr = flags[10 + offset]

    dom = cElementTree.fromstring(xmlstr)
    # The UUID value is not part of the output, but rows without it have
    # always been rejected; keep that behaviour explicit.
    if 'UUID' not in dom.attrib:
        raise KeyError('UUID')

    node = dom.find('UserDoubleClick')
    if node is None:
        raise ValueError('missing UserDoubleClick element')
    for required in ('AssociateKey', 'FileExt', 'Count'):
        if node.get(required) is None:
            raise ValueError('UserDoubleClick has no %s attribute' % required)

    associate_key = node.get('AssociateKey')
    player = associate_key.split('.')[0]
    file_ext = node.get('FileExt')
    count = node.get('Count')

    # The mapper only extracts and concatenates values. Everything before the
    # tab is the reduce key; the count after it is the value to aggregate.
    key = ','.join([stat_date_bar, version, file_ext, player, associate_key])
    return key + '\t' + count


def main():
    """Streaming mapper entry point: read rows from stdin, emit records.

    Rows with an unexpected field count are skipped silently. Rows that fail
    to parse are skipped and the error is reported on stderr, so that
    diagnostics never end up in the record stream written to stdout.
    """
    for line in sys.stdin:
        try:
            record = map_line(line, lzo)
            if record is not None:
                sys.stdout.write(record + '\n')
        except Exception:
            # sys.exc_info() is used instead of binding the exception in the
            # except clause so the script parses on both old and new Pythons.
            sys.stderr.write('%s\n' % (sys.exc_info()[1],))


if __name__ == '__main__':
    main()