Hadoop Setup

Environment Setup

Part One

  1. Change the hostname to master

    ifconfig
    vim /etc/hostname
    vim /etc/hosts
    hostname master
    sudo passwd root
    ssh root@127.0.0.1
    su root
    sudo ssh root@127.0.0.1
    <!-- sudo ssh parallels@10.211.55.3 -->
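    The hosts file should map the hostname to the machine's address; a minimal sketch of /etc/hosts, assuming the VM's IP is 10.211.55.3 (the address used for the web UIs later in this post):

    # /etc/hosts
    127.0.0.1    localhost
    10.211.55.3  master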
  2. ssh-keygen

Check the SSH installation

rpm -qa | grep openssh
sudo yum install openssh-clients
sudo yum install openssh-server
ssh root@127.0.0.1
ssh-keygen -t rsa
ssh-copy-id master
ssh master
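To confirm that key-based login works before continuing, a quick check (BatchMode makes ssh fail instead of falling back to a password prompt):

ssh -o BatchMode=yes master 'hostname'   # should print "master" without asking for a password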
  3. Install Hadoop and the JDK
    tar -zxvf hadoop-2.6.0.tar.gz
    mv hadoop-2.6.0 /usr/local/hadoop

    tar -zxvf jdk-8u181-linux-x64.gz
    mv jdk1.8.0_181 /usr/local/jdk

    cd /usr/local/hadoop
    cd etc/hadoop
    # configure JAVA_HOME
    vim hadoop-env.sh
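    In hadoop-env.sh, point JAVA_HOME at the JDK that was just moved into place (sketch, using the /usr/local/jdk path from above):

    # hadoop-env.sh
    export JAVA_HOME=/usr/local/jdk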
  • core-site.xml
    vim core-site.xml
    <configuration>
      <property>
        <name>fs.default.name</name>
        <value>hdfs://master:9000</value>
      </property>
      <property>
        <name>hadoop.tmp.dir</name>
        <value>/usr/local/hadoop/tmp</value>
      </property>
    </configuration>
  • hdfs-site.xml
    vim hdfs-site.xml
    <configuration>
      <property>
        <name>dfs.replication</name>
        <value>1</value>
      </property>
      <property>
        <name>dfs.permissions</name>
        <value>false</value>
      </property>
    </configuration>
  • mapred-site.xml
    cp mapred-site.xml.template mapred-site.xml
    vim mapred-site.xml
    <configuration>
      <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
      </property>
      <property>
        <name>mapreduce.jobhistory.address</name>
        <value>master:10020</value>
      </property>
    </configuration>
  • yarn-site.xml
    vim yarn-site.xml

    <configuration>
      <!-- Site specific YARN configuration properties -->
      <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>master</value>
      </property>
      <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
      </property>
      <property>
        <name>mapreduce.job.ubertask.enable</name>
        <value>true</value>
      </property>
    </configuration>
  • Environment variables
    vim /etc/profile
    export HADOOP_HOME=/usr/local/hadoop
    export PATH=.:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
    source /etc/profile
    hadoop namenode -format
    cd hadoop/sbin
    start-dfs.sh
    start-yarn.sh
    systemctl stop firewalld.service
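    After start-dfs.sh and start-yarn.sh, jps should list the HDFS and YARN daemons. A rough sketch of the expected output on this single-node setup (process IDs will differ):

    jps
    # 2481 NameNode
    # 2602 DataNode
    # 2790 SecondaryNameNode
    # 3043 ResourceManager
    # 3151 NodeManager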
  • Hadoop web UI

10.211.55.3:50070

  • YARN web UI

10.211.55.3:8088

  • hbase

    tar -zxvf hadoop-2.6.0.tar.gz 
    rm -rf hadoop-2.6.0
    tar -zxvf hbase-1.0.0-cdh5.5.1.tar.gz
    mv hbase-1.0.0-cdh5.5.1 /usr/local/hbase
  • Enter the installation directory

    cd /usr/local/hbase
    cd conf/
    vim hbase-site.xml
  • hbase-site.xml

    <configuration>
    <property>
      <name>hbase.rootdir</name>
      <value>hdfs://master:9000/hbase</value>
    </property>
    <property>
      <name>hbase.cluster.distributed</name>
      <value>true</value>
    </property>
    <property>
      <name>hbase.zookeeper.quorum</name>
      <value>master</value>
    </property>
    <property>
      <name>dfs.replication</name>
      <value>1</value>
    </property>
    </configuration>
  • hbase-env.sh

    vim hbase-env.sh 
    export JAVA_HOME=/usr/local/jdk
  • regionservers

    # regionservers should contain only the hostname of this single node
    vim regionservers
    master

    # after start-hbase.sh, verify the HBase-managed ZooKeeper and the RegionServer are running
    ps aux | grep zookeeper
    ps aux | grep regionserver
  • profile

    vim /etc/profile
    export HBASE_HOME=/usr/local/hbase
    export HADOOP_HOME=/usr/local/hadoop
    export JAVA_HOME=/usr/local/jdk
    export ZK_HOME=/usr/local/zk
    export PATH=.:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$JAVA_HOME/bin:$ZK_HOME/bin:$HBASE_HOME/bin:$PATH

    source /etc/profile
  • Verify

    jps
    start-hbase.sh
    10.211.55.3:60010
  • hbase shell

    hbase shell

    create 'userscanlog','info'
    create 'pindaoanaly','info'

    list
  • create 'userscanlog','info'

    hbase(main):001:0> create 'userscanlog','info'

    ERROR: Can't get master address from ZooKeeper; znode data == null

    Here is some help for this command:
    Creates a table. Pass a table name, and a set of column family
    specifications (at least one), and, optionally, table configuration.
    Column specification can be a simple string (name), or a dictionary
    (dictionaries are described below in main help output), necessarily
    including NAME attribute.
    Examples:

    Create a table with namespace=ns1 and table qualifier=t1
    hbase> create 'ns1:t1', {NAME => 'f1', VERSIONS => 5}

    Create a table with namespace=default and table qualifier=t1
    hbase> create 't1', {NAME => 'f1'}, {NAME => 'f2'}, {NAME => 'f3'}
    hbase> # The above in shorthand would be the following:
    hbase> create 't1', 'f1', 'f2', 'f3'
    hbase> create 't1', {NAME => 'f1', VERSIONS => 1, TTL => 2592000, BLOCKCACHE => true}
    hbase> create 't1', {NAME => 'f1', CONFIGURATION => {'hbase.hstore.blockingStoreFiles' => '10'}}

    Table configuration options can be put at the end.
    Examples:

    hbase> create 'ns1:t1', 'f1', SPLITS => ['10', '20', '30', '40']
    hbase> create 't1', 'f1', SPLITS => ['10', '20', '30', '40']
    hbase> create 't1', 'f1', SPLITS_FILE => 'splits.txt', OWNER => 'johndoe'
    hbase> create 't1', {NAME => 'f1', VERSIONS => 5}, METADATA => { 'mykey' => 'myvalue' }
    hbase> # Optionally pre-split the table into NUMREGIONS, using
    hbase> # SPLITALGO ("HexStringSplit", "UniformSplit" or classname)
    hbase> create 't1', 'f1', {NUMREGIONS => 15, SPLITALGO => 'HexStringSplit'}
    hbase> create 't1', 'f1', {NUMREGIONS => 15, SPLITALGO => 'HexStringSplit', REGION_REPLICATION => 2, CONFIGURATION => {'hbase.hregion.scan.loadColumnFamiliesOnDemand' => 'true'}}

    You can also keep around a reference to the created table:

    hbase> t1 = create 't1', 'f1'

    Which gives you a reference to the table named 't1', on which you can then
    call methods.
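    The "Can't get master address from ZooKeeper; znode data == null" error above usually means the HMaster process is not running (or died right after start-hbase.sh), so ZooKeeper has no master znode to hand out. A rough troubleshooting sketch, using the paths from earlier in this post:

    # HMaster, HRegionServer and HQuorumPeer should all show up
    jps
    # check the master log for the real cause (assumes HBASE_HOME=/usr/local/hbase)
    tail -n 100 /usr/local/hbase/logs/hbase-*-master-*.log
    # if HDFS is healthy but HBase is not, restart HBase
    stop-hbase.sh
    start-hbase.sh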


  • MySQL-5.5.53
  1. Remove any MySQL-related packages already installed on the Linux host: rpm -e xxxxxxx --nodeps
    Run rpm -qa | grep mysql afterwards to check that everything was removed cleanly.
    rpm -qa | grep mysql                 # check for leftover MySQL packages
    rpm -ivh mysql.rpm
    rpm -qa | grep mariadb               # mariadb-libs conflicts with the MySQL server RPM
    rpm -e --nodeps mariadb-libs
  2. Install the MySQL server: rpm -i MySQL-server-****
    tar -xvf MySQL-5.5.53-1.linux2.6.x86_64.rpm-bundle.tar
    rpm -ivh MySQL-server-5.5.53-1.linux2.6.x86_64.rpm
    # the server RPM's post-install message suggests setting a root password:
    /usr/bin/mysqladmin -u root password 'new-password'
    /usr/bin/mysqladmin -u root -h master password 'new-password'

    Alternatively you can run:
    /usr/bin/mysql_secure_installation
  3. Start the MySQL server: mysqld_safe &
    (Make sure the server is running before installing the client and before step 5; otherwise step 5 will fail.)
    mysqld_safe &
  4. Install the MySQL client: rpm -i MySQL-client-****
    rpm -ivh MySQL-client-5.5.53-1.linux2.6.x86_64.rpm
  5. Run mysql_secure_installation to set the root password
    mysql_secure_installation

    mysql -u root -p        # log in with the root password (or simply: mysql -p)
    grant all on hive.* to 'root'@'%' identified by 'my81527';
    flush privileges;
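    A quick way to confirm the grant took effect is to connect over TCP as 'root'@'%' (sketch; the password is the one set in the grant above):

    mysql -u root -pmy81527 -h master -e "show grants for 'root'@'%';"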

Hive Installation

  1. Unpack

    tar -zxvf apache-hive-1.0.0-bin.tar.gz
  2. Move to /usr/local/hive

    mv apache-hive-1.0.0-bin /usr/local/hive
    cd /usr/local/hive/
    cd conf/
  3. hive-site.xml

    <?xml version="1.0"?>
    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
    <configuration>
      <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://master:3306/hive?createDatabaseIfNotExist=true</value>
      </property>
      <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
      </property>
      <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>root</value>
      </property>
      <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>my81527</value>
      </property>
      <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/user/hive/warehouse</value>
      </property>
      <property>
        <name>hive.metastore.local</name>
        <value>true</value>
      </property>
    </configuration>
  4. Upload the MySQL connector JAR

    cp mysql-connector-java-5.1.40-bin.jar /usr/local/hive/lib/
  5. Environment variables

    vim /etc/profile

    export HBASE_HOME=/usr/local/hbase
    export HADOOP_HOME=/usr/local/hadoop
    export JAVA_HOME=/usr/local/jdk
    export ZK_HOME=/usr/local/zk
    export HIVE_HOME=/usr/local/hive
    export PATH=.:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$JAVA_HOME/bin:$ZK_HOME/bin:$HBASE_HOME/bin:$HIVE_HOME/bin:$PATH

    source /etc/profile
  6. Start Hive

    cd bin
    hive

    show tables;
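    A small smoke test to confirm that the metastore connection works end to end (sketch; the table name is arbitrary):

    hive -e "create table smoke_test(id int, name string); show tables; drop table smoke_test;"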

Recommendation System: Sqoop Environment Setup

  • Extract and install
    tar -zxvf sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz
    mv sqoop-1.4.6.bin__hadoop-2.0.4-alpha /usr/local/sqoop
    cd /usr/local/sqoop
    # copy a MySQL JDBC driver into Sqoop's lib directory (one of these is enough)
    cp mysql-connector-java-5.1.40-bin.jar /usr/local/sqoop/lib
    cp mysql-connector-java-5.1.35.jar /usr/local/sqoop/lib
    vim /etc/profile
    export SQOOP_HOME=/usr/local/sqoop
    export PATH=.:$SQOOP_HOME/bin:$PATH
    source /etc/profile
  1. Copy tables from MySQL into HDFS/Hive:
    sqoop                                      ## the sqoop command
    import                                     ## import data (MySQL -> Hadoop)
    --connect jdbc:mysql://ip:3306/sqoop       ## JDBC URL of the source MySQL database
    --username root                            ## MySQL user name
    --password admin                           ## MySQL password
    --table mysql1                             ## the MySQL table to export
    --fields-terminated-by '\t'                ## field delimiter for the output records
    -m 1                                       ## run the copy with a single map task
    --hive-import                              ## load the data into Hive; without this flag it only lands in HDFS

    Incremental import parameters:
    --check-column (col)    the column used to decide which rows are new, e.g. id
    --incremental (mode)    append: import rows whose check column is greater than --last-value; lastmodified: import rows modified after the --last-value timestamp
    --last-value (value)    the largest value seen in the previous import (only rows above it are imported); it can also be set by hand
    For --incremental, use lastmodified when the check is based on a date, otherwise use append. (See the sketch after this block for a complete example.)

    Import into Hive:
    sqoop import --connect jdbc:mysql://master:3306/hive --username root --password my81527 --table TBLS --fields-terminated-by '\t' --null-string '**' --m 1 --append --hive-import

    CREATE TABLE t2(id int, name string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

    --query 'SELECT a.*, b.* FROM a JOIN b on (a.id == b.id) WHERE $CONDITIONS'

    hdfs dfs -rm -r /user/hive/warehouse/*
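    Putting the incremental parameters together, a sketch of an append-mode import (illustrative only; it assumes the MySQL ordermain table has a numeric id column to use as the check column):

    sqoop import \
      --connect jdbc:mysql://master:3306/cargocn-cloud \
      --username root --password my81527 \
      --table ordermain \
      --check-column id \
      --incremental append \
      --last-value 0 \
      --fields-terminated-by '\t' \
      -m 1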

Database Creation

hive
create database cargocn-cloud;
use cargocn-cloud;

CREATE TABLE user(id string, name string,age string,address string,telphone string,qq string,weixin string,email string,sex string,birthday string,account string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

CREATE TABLE producttype(id string, producttypename string,producttypedescription string,typegrade string,parentid string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';


CREATE TABLE productdetail(id string,proudctid string,productplace string,productdescription string,productbrand string,productweight string,productspecification string,productdetaipicurl string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';


CREATE TABLE product(id string,producttypeid string,producttitle string,productprice string,mechartid string,createtime string,audittime string,auditstate string,stocknum string,sellnum string,productpicurl string,proudctstatus string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';


CREATE TABLE ordermain(id string,payamount string,userid string,createtime string,paytime string,paystatus string,consigneeadress string,consigneephone string,consigneename string,tradenumber string,paytype string,orderstatus string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

CREATE TABLE orderdetail(id string,orderid string,productid string,mechartid string,createtime string,tradenum string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

CREATE TABLE mechant(id string,merchantname string,merchantshopname string,merchantaccount string,mechantscope string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
  • Sqoop sync commands
    sqoop import --connect jdbc:mysql://10.211.55.3:3306/cargocn-cloud --username root --password my81527 --query 'SELECT id,name,age,address,telphone,qq,weixin,email,sex,birthday,account FROM user WHERE $CONDITIONS' --fields-terminated-by '\t' --null-string '**' --target-dir /user/hive/warehouse/cargocn-cloud.db/user --hive-table cargocn-cloud.user --m 1 --hive-import

    sqoop import --connect jdbc:mysql://10.211.55.3:3306/cargocn-cloud --username root --password my81527 --table producttype --fields-terminated-by '\t' --null-string '**' --target-dir /user/hive/warehouse/cargocn-cloud.db/producttype --hive-table cargocn-cloud.producttype --m 1 --hive-import

    sqoop import --connect jdbc:mysql://10.211.55.3:3306/cargocn-cloud --username root --password my81527 --table productdetail --fields-terminated-by '\t' --null-string '**' --target-dir /user/hive/warehouse/cargocn-cloud.db/productdetail --hive-table cargocn-cloud.productdetail --m 1 --hive-import

    sqoop import --connect jdbc:mysql://10.211.55.3:3306/cargocn-cloud --username root --password my81527 --table product --fields-terminated-by '\t' --null-string '**' --m 1 --target-dir /user/hive/warehouse/cargocn-cloud.db/product --hive-table cargocn-cloud.product --hive-import

    sqoop import --connect jdbc:mysql://10.211.55.3:3306/cargocn-cloud --username root --password my81527 --table ordermain --fields-terminated-by '\t' --null-string '**' --m 1 --target-dir /user/hive/warehouse/cargocn-cloud.db/ordermain --hive-table cargocn-cloud.ordermain --hive-import

    sqoop import --connect jdbc:mysql://10.211.55.3:3306/cargocn-cloud --username root --password my81527 --table orderdetail --fields-terminated-by '\t' --null-string '**' --m 1 --target-dir /user/hive/warehouse/cargocn-cloud.db/orderdetail --hive-table cargocn-cloud.orderdetail --hive-import

    sqoop import --connect jdbc:mysql://10.211.55.3:3306/cargocn-cloud --username root --password my81527 --query 'SELECT id ,merchantname ,merchantshopname ,merchantaccount ,mechantscope FROM mechant WHERE $CONDITIONS' --fields-terminated-by '\t' --null-string '**' --m 1 --target-dir /user/hive/warehouse/cargocn-cloud.db/mechant --hive-table cargocn-cloud.mechant --hive-import
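    To check that the imports landed where expected (sketch):

    hdfs dfs -ls /user/hive/warehouse/cargocn-cloud.db
    hdfs dfs -cat /user/hive/warehouse/cargocn-cloud.db/user/* | head -5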

Flume Collection

tar -zxvf apache-flume-1.6.0-bin.tar.gz 
mv apache-flume-1.6.0-bin /usr/local/flume
cd /usr/local/flume/conf
  • Edit the configuration file flume-conf.properties

    # agent "productinfo": Kafka source -> memory channel -> HDFS sink
    # (the source and the sink both happen to be named s1; Flume keeps them in separate namespaces)
    productinfo.sources  = s1
    productinfo.channels = c1
    productinfo.sinks = s1

    # Kafka source: consume the productscanlogflume topic via the master ZooKeeper
    productinfo.sources.s1.type = org.apache.flume.source.kafka.KafkaSource
    productinfo.sources.s1.zookeeperConnect = master:2181
    productinfo.sources.s1.topic = productscanlogflume
    productinfo.sources.s1.groupId = ty1
    productinfo.sources.s1.channels = c1
    productinfo.sources.s1.interceptors = i1
    productinfo.sources.s1.interceptors.i1.type = timestamp
    productinfo.sources.s1.kafka.consumer.timeout.ms = 1000

    # in-memory channel
    productinfo.channels.c1.type = memory
    productinfo.channels.c1.capacity = 1000
    productinfo.channels.c1.transactionCapacity = 1000

    # HDFS sink: write plain-text files, rolled every 30 seconds
    productinfo.sinks.s1.type = hdfs
    productinfo.sinks.s1.hdfs.path = /data/kafka/productinfo/%y-%m-%d
    productinfo.sinks.s1.hdfs.fileType = DataStream
    productinfo.sinks.s1.hdfs.rollSize = 0
    productinfo.sinks.s1.hdfs.rollCount = 0
    productinfo.sinks.s1.hdfs.rollInterval = 30
    productinfo.sinks.s1.channel = c1
  • Start the agent

    /usr/local/flume/bin/flume-ng agent -f /usr/local/flume/conf/flume-conf.properties -n productinfo > productinfo.txt
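    Once the agent is running, events published to the productscanlogflume topic should end up under the HDFS path from the sink config. A rough check; the producer command assumes Kafka's command line tools are on the PATH and the broker listens on master:9092 (installing Kafka is not covered in this post), and the JSON event is made up:

    echo '{"productid":"1","userid":"1"}' | kafka-console-producer.sh --broker-list master:9092 --topic productscanlogflume
    # after the 30 second roll interval, a file should appear under the dated directory
    hdfs dfs -ls /data/kafka/productinfo/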

Recommendation System: Implementing Recommendations with Spark MLlib

  • Maven dependency
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-mllib_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
      <scope>provided</scope>
    </dependency>
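    As written, ${scala.binary.version} and ${project.version} only resolve inside Spark's own build; in a standalone project they would be replaced with concrete values, and the provided scope assumes the Spark jars come from the cluster at run time. A sketch of building and submitting such a job to the YARN cluster set up above (the jar name and main class are placeholders, and a Spark distribution with HADOOP_CONF_DIR pointing at /usr/local/hadoop/etc/hadoop is assumed):

    mvn -DskipTests package
    spark-submit \
      --master yarn \
      --class com.example.RecommendJob \
      target/recommend-1.0.jar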